From 96273316de62bd0924835c39dacc95b24e46fa14 Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Fri, 20 Dec 2024 14:54:07 -0800 Subject: [PATCH] clean up code --- marine.py | 477 ++++------------------------------ src/core.py | 236 ++++++++++++++++- src/utils.py | 221 +++++++++++++++- tests/integration_tests.ipynb | 38 +-- 4 files changed, 511 insertions(+), 461 deletions(-) diff --git a/marine.py b/marine.py index c4e0d74..6f98f13 100755 --- a/marine.py +++ b/marine.py @@ -19,291 +19,23 @@ from matplotlib import pyplot as plt import math import shlex -import scipy.sparse as sp -import anndata as ad import os -# checkpoint - sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'src/')) -from read_process import incorporate_replaced_pos_info,incorporate_insertions_and_deletions,\ -get_positions_from_md_tag,reverse_complement,get_edit_information,get_edit_information_wrapper,\ -has_edits,get_total_coverage_for_contig_at_position,\ -print_read_info, get_read_information, get_hamming_distance, \ -remove_softclipped_bases,find - -from utils import get_intervals, index_bam, write_rows_to_info_file, write_header_to_edit_info, \ -write_read_to_bam_file, remove_file_if_exists, make_folder, concat_and_write_bams_wrapper, \ -pretty_print, read_barcode_whitelist_file, get_contigs_that_need_bams_written, \ -make_depth_command_script_single_cell, generate_and_run_bash_merge, get_sailor_sites, \ -concatenate_files, run_command, get_edits_with_coverage_df, zero_edit_found, delete_intermediate_files +from utils import make_folder, pretty_print, make_depth_command_script_single_cell, \ +concatenate_files, get_edits_with_coverage_df, zero_edit_found, delete_intermediate_files, \ +pivot_edits_to_sparse, print_marine_logo, convert_sites_to_sailor, convert_conversions_argument, \ +generate_bedgraphs, check_folder_is_empty_warn_if_not, print_all_cells_coverage_warning -from core import run_edit_identifier, run_bam_reconfiguration, \ -gather_edit_information_across_subcontigs, run_coverage_calculator, generate_site_level_information +from core import run_edit_identifier, run_bam_reconfiguration, run_edit_finding, \ +gather_edit_information_across_subcontigs, run_coverage_calculator, generate_site_level_information, \ +generate_depths from annotate import annotate_sites, get_strand_specific_conversion -def edit_finder(bam_filepath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], - verbose=False, cores=64, min_read_quality=0, min_base_quality=0, dist_from_end=0, interval_length=2000000): - - pretty_print("Each contig is being split into subsets of length...".format(interval_length)) - - overall_label_to_list_of_contents, results, overall_time, overall_total_reads, \ - total_seconds_for_reads, counts_summary_df = run_edit_identifier( - bam_filepath, - output_folder, - strandedness=strandedness, - barcode_tag=barcode_tag, - barcode_whitelist=barcode_whitelist, - contigs=contigs, - verbose=verbose, - cores=cores, - min_read_quality=min_read_quality, - min_base_quality=min_base_quality, - dist_from_end=dist_from_end, - interval_length=interval_length - ) - - #print(overall_label_to_list_of_contents.keys()) - #print(overall_label_to_list_of_contents.get(list(overall_label_to_list_of_contents.keys())[0])) - - pretty_print( - [ - "Reads processed:\t{}".format(overall_total_reads), - "Time to process reads in min:\t{}".format(round(overall_time/60, 5)), - "Read Summary:\n{}".format(counts_summary_df) - ], - style="-" - ) - - - total_seconds_for_reads_df = 
pd.DataFrame.from_dict(total_seconds_for_reads, orient='index') - total_seconds_for_reads_df.columns = ['seconds'] - total_seconds_for_reads_df['reads'] = total_seconds_for_reads_df.index - total_seconds_for_reads_df.index = range(len(total_seconds_for_reads_df)) - - - return overall_label_to_list_of_contents, results, total_seconds_for_reads_df, overall_total_reads, counts_summary_df - - -def bam_processing(overall_label_to_list_of_contents, output_folder, barcode_tag='CB', cores=1, number_of_expected_bams=4, - verbose=False): - # Only used for single-cell and/or long read reconfiguration of bams to optimize coverage calculation - split_bams_folder = '{}/split_bams'.format(output_folder) - make_folder(split_bams_folder) - contigs_to_generate_bams_for = get_contigs_that_need_bams_written(list(overall_label_to_list_of_contents.keys()), - split_bams_folder, - barcode_tag=barcode_tag, - number_of_expected_bams=number_of_expected_bams - ) - if verbose: - pretty_print("Will split and reconfigure the following contigs: {}".format(",".join(contigs_to_generate_bams_for))) - - - # BAM Generation - total_bam_generation_time, total_seconds_for_bams = run_bam_reconfiguration(split_bams_folder, bam_filepath, overall_label_to_list_of_contents, contigs_to_generate_bams_for, barcode_tag=barcode_tag, cores=cores, - number_of_expected_bams=number_of_expected_bams, - verbose=verbose) - - total_seconds_for_bams_df = pd.DataFrame.from_dict(total_seconds_for_bams, orient='index') - total_seconds_for_bams_df.columns = ['seconds'] - total_seconds_for_bams_df['contigs'] = total_seconds_for_bams_df.index - total_seconds_for_bams_df.index = range(len(total_seconds_for_bams_df)) - - return total_bam_generation_time, total_seconds_for_bams_df - - -def print_marine_logo(): - logo_lines = [ - ":::: :::: ::: ::::::::: ::::::::::: :::: ::: :::::::::: ", - "+:+:+: :+:+:+ :+: :+: :+: :+: :+: :+:+: :+: :+: ", - "+:+ +:+:+ +:+ +:+ +:+ +:+ +:+ +:+ :+:+:+ +:+ +:+ ", - "+#+ +:+ +#+ +#++:++#++: +#++:++#: +#+ +#+ +:+ +#+ +#++:++# ", - "+#+ +#+ +#+ +#+ +#+ +#+ +#+ +#+ +#+#+# +#+ ", - "#+# #+# #+# #+# #+# #+# #+# #+# #+#+# #+# ", - "### ### ### ### ### ### ########### ### #### ########## " - ] - for l in logo_lines: - pretty_print(l) - - pretty_print("Multi-core Algorithm for Rapid Identification of Nucleotide Edits", style="=") - - -def get_broken_up_contigs(contigs, num_per_sublist): - broken_up_contigs = [] - - i_options = range((math.ceil(len(contigs)/num_per_sublist)) + 1) - - for i in i_options: - contig_sublist = [] - j_options = range(i*num_per_sublist, (i*num_per_sublist) + num_per_sublist) - - for j in j_options: - if j < len(contigs): - contig_sublist.append(contigs[j]) - - if len(contig_sublist) > 0: - broken_up_contigs.append(contig_sublist) - return broken_up_contigs - -def split_bed_file(input_bed_file, output_folder, bam_filepaths, output_suffix=''): - """ - Split a BED file into multiple files based on suffixes in the first column. - Each line is assigned to the appropriate file based on the suffix. 
- - e.g.: - - 10_AAACGAAAGTCACACT-1 6143263 6143264 - 10_AAACGAAAGTCACACT-1 11912575 11912576 - 10_AAACGAAAGTCACACT-1 12209751 12209752 - 10_AAACGAAAGTCACACT-1 13320235 13320236 - 10_AAACGAAAGTCACACT-1 27036085 27036086 - - """ - if not os.path.exists(output_folder): - os.makedirs(output_folder) - - single_cell_approach = len(bam_filepaths) > 0 - - suffix_pairs = [ - (os.path.basename(bam).split("_")[0], - os.path.basename(bam).split("_")[1].split(".")[0]) for bam in bam_filepaths - ] - - # Open file handles for each suffix - file_handles = {} - for prefix, suffix in suffix_pairs: - output_file = os.path.join(output_folder, f"combined_{output_suffix}_{prefix}_{suffix}.bed") - file_handles[prefix + suffix] = open(output_file, 'w') - - try: - with open(input_bed_file, 'r') as infile: - for line in infile: - # Parse the first column to determine the suffix - columns = line.split() - - chrom = columns[0] # Assuming the first column is the chromosome - for prefix, suffix in suffix_pairs: - if chrom.startswith(f"{prefix}_") and chrom.endswith(suffix): - file_handles[prefix + suffix].write(line) - break - - finally: - # Close all file handles - for handle in file_handles.values(): - handle.close() - - -def generate_depths(output_folder, bam_filepaths, paired_end=False, barcode_tag=None): - - coverage_start_time = time.perf_counter() - - all_depth_commands = [] - - combine_edit_sites_command = ( - "echo 'concatenating bed file...';" - "for file in {}/edit_info/*edit_info.tsv; do " - "awk 'NR > 1 {{print $2, $4-1, $4}}' OFS='\t' \"$file\"; " - "done | sort -k1,1 -k2,2n -u > {}/combined_source_cells.bed;" - ).format(output_folder, output_folder) - - if not os.path.exists(f'{output_folder}/combined_source_cells.bed'): - run_command(combine_edit_sites_command) - - all_depth_commands.append(combine_edit_sites_command) - - output_suffix = 'source_cells' - - if barcode_tag: - coverage_subfolder = '{}/coverage'.format(output_folder) - make_folder(coverage_subfolder) - - # Single cell mode - split_bed_file( - f"{output_folder}/combined_{output_suffix}.bed", - f"{output_folder}/combined_{output_suffix}_split_by_suffix", - bam_filepaths, - output_suffix=output_suffix - ) - - make_depth_command_script_single_cell(paired_end, bam_filepaths, output_folder, - all_depth_commands=all_depth_commands, output_suffix='source_cells', run=True, processes=cores, barcode_tag=barcode_tag) - - else: - if paired_end: - paired_end_flag = '-s ' - else: - paired_end_flag = '' - - # Bulk mode, we will not split the bed and simply use samtools depth on the combined.bed - samtools_depth_command = f"samtools depth {paired_end_flag}-a -b {output_folder}/combined_source_cells.bed {bam_filepath} > {output_folder}/depths_source_cells.txt" - run_command(samtools_depth_command) - - - print("Concatenating edit info files...") - concatenate_files(output_folder, "edit_info/*edit_info.tsv", - "{}/final_edit_info_no_coverage.tsv".format(output_folder), - run=True) - - print("Append the depth columns to the concatenated final_edit_info file...") - - header_columns = ['barcode', 'contig', 'position', 'ref', 'alt', - 'read_id', 'strand', 'coverage'] - - - generate_and_run_bash_merge(output_folder, - '{}/final_edit_info_no_coverage.tsv'.format(output_folder), - '{}/depths_source_cells.txt'.format(output_folder), - '{}/final_edit_info.tsv'.format(output_folder), - header_columns, barcode_tag=barcode_tag) - - coverage_total_time = time.perf_counter() - coverage_start_time - - total_seconds_for_contig_df = pd.DataFrame({'coverage_total_time': 
[coverage_total_time]}) - return coverage_total_time, total_seconds_for_contig_df - - -def convert_sites_to_sailor(final_site_level_information_df, sailor_list, output_folder, skip_coverage): - # Output SAILOR-formatted file for use in FLARE downstream - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # 1 629275 629276 0.966040688 2,30 + - # 1 629309 629310 2.8306e-05 1,1043 + - - for conversion in sailor_list: - conversion_search = conversion[0] + '>' + conversion[1] - - print("Generating SAILOR-style bed outputs for conversion {}...".format(conversion)) - - sailor_sites,weird_sites = get_sailor_sites(final_site_level_information_df, conversion_search, skip_coverage=skip_coverage) - sailor_sites = sailor_sites.drop_duplicates() - - print("{} final deduplicated {} SAILOR-formatted sites".format(len(sailor_sites), conversion_search)) - sailor_sites.to_csv('{}/sailor_style_sites_{}.bed'.format( - output_folder, - conversion_search.replace(">", "-")), - header=False, - index=False, - sep='\t') - - -def generate_bedgraphs(final_site_level_information_df, conversion_search, output_folder): - bedgraph_folder = '{}/bedgraphs'.format(output_folder) - make_folder(bedgraph_folder) - - pretty_print("Making bedgraphs for {} conversions...\n".format(bedgraphs_list)) - for conversion in bedgraphs_list: - conversion_search = conversion[0] + '>' + conversion[1] - sites_for_conversion = final_site_level_information_df[final_site_level_information_df.conversion == conversion_search] - sites_for_conversion['edit_fraction'] = sites_for_conversion['count']/sites_for_conversion['coverage'] - sites_for_conversion['start'] = sites_for_conversion['position'] - 1 - sites_for_conversion_bedgraph_cols = sites_for_conversion[['contig', 'start', 'position', 'edit_fraction']] - - sites_for_conversion_bedgraph_cols.to_csv('{}/{}_{}.bedgraph'.format(bedgraph_folder, output_folder.split('/')[-1], conversion), sep='\t', index=False, header=False) - - def prepare_combinations_for_split(df, bam_filepaths, output_folder, output_suffix): """ Prepares the chromosome-suffix combinations for multiprocessing. 
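The hunk above only shows the docstring of prepare_combinations_for_split(); as a rough sketch (not the function's actual body), the chromosome/suffix combinations it prepares could be assembled as below, reusing the <prefix>_<suffix>.bam naming that split_bed_file() parses and the process_combination_for_split() worker that pool.map() dispatches later in this patch. The exact tuple fields are an assumption.

import os
from multiprocessing import Pool

def prepare_combinations_sketch(df, bam_filepaths, output_folder, output_suffix):
    # Suffix pairs parsed from split BAM names of the form <prefix>_<suffix>.bam,
    # mirroring the parsing done in split_bed_file()
    suffix_pairs = [
        (os.path.basename(b).split("_")[0],
         os.path.basename(b).split("_")[1].split(".")[0])
        for b in bam_filepaths
    ]
    chroms = df["contig"].astype(str).unique()
    # One task per (chromosome, prefix, suffix), so each worker writes one BED shard
    return [
        (chrom, prefix, suffix, output_folder, output_suffix)
        for chrom in chroms
        for prefix, suffix in suffix_pairs
    ]

# Assumed dispatch, matching the Pool pattern used elsewhere in this patch:
# with Pool(processes=4) as pool:
#     pool.map(process_combination_for_split, combinations)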
@@ -376,46 +108,6 @@ def process_combination_for_split(args): print(f"\t\t\t>>> Processed {chrom}, {prefix}_{suffix} -> {output_file}") -def pivot_edits_to_sparse(df, output_folder): - # Create a new column for contig:position - df["CombinedPosition"] = df["contig"].astype(str) + ":" + df["position"].astype(str) - - # Ensure the output directory exists - final_output_dir = os.path.join(output_folder, "final_matrix_outputs") - os.makedirs(final_output_dir, exist_ok=True) - - for strand_conversion in df.strand_conversion.unique(): - print(f"Processing strand_conversion: {strand_conversion}") - - # Pivot the dataframe - pivoted_df = df[df.strand_conversion == strand_conversion].pivot( - index="CombinedPosition", - columns="barcode", - values="count" - ) - - # Replace NaN with 0 for missing values - pivoted_df = pivoted_df.fillna(0) - - # Convert to a sparse matrix - sparse_matrix = sp.csr_matrix(pivoted_df.values) - - # Create an AnnData object - adata = ad.AnnData( - X=sparse_matrix, - obs=pd.DataFrame(index=pivoted_df.index), # Row (site) metadata - var=pd.DataFrame(index=pivoted_df.columns) # Column (barcode) metadata - ) - - # Save the AnnData object - output_file = os.path.join( - final_output_dir, - f"comprehensive_{strand_conversion.replace('>', '_')}_edits_matrix.h5ad" - ) - adata.write(output_file) - print(f"Saved sparse matrix for {strand_conversion} to {output_file}") - - def generate_and_split_bed_files_for_all_edits(output_folder, bam_filepaths, tabulation_bed=None, processes=4, output_suffix="all_cells"): """ Generates combined BED files for all edit sites and splits them into suffix-specific files. @@ -438,9 +130,12 @@ def generate_and_split_bed_files_for_all_edits(output_folder, bam_filepaths, tab tabulation_bed_df['contig_position'] = tabulation_bed_df['chrom'].astype(str) + '_' + tabulation_bed_df['start'].astype(str) print(f"\t{len(tabulation_bed_df)} unique positions in {tabulation_bed}...") print(tabulation_bed_df.head()) + + valid_positions = set(df['contig_position']) + positions_to_keep = valid_positions.intersection(set(tabulation_bed_df.contig_position)) - df = df[df['contig_position'].isin(set(tabulation_bed_df.contig_position))] - print(f"\tRunning {len(df)} positions through all-cell coverage tabulation...") + print(f"\t{len(positions_to_keep)} out of {len(valid_positions)} specified positions in {tabulation_bed} are valid") + df = df[df['contig_position'].isin(positions_to_keep)] # Pivot edit dataframe print("Pivoting edits dataframe into sparse h5ad files...") @@ -465,8 +160,7 @@ def generate_and_split_bed_files_for_all_edits(output_folder, bam_filepaths, tab pool.map(process_combination_for_split, combinations) print(f"All split BED files generated in {output_folder}/combined_{output_suffix}_split_by_suffix") - - + def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strandedness=True, barcode_tag="CB", paired_end=False, barcode_whitelist_file=None, verbose=False, coverage_only=False, filtering_only=False, annotation_only=False, bedgraphs_list=[], sailor_list=[], min_base_quality = 15, min_read_quality = 0, min_dist_from_end = 10, max_edits_per_read = None, cores = 64, number_of_expected_bams=4, keep_intermediate_files=False, @@ -475,10 +169,6 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand all_cells_coverage=False, tabulation_bed=None ): - # Check to make sure the folder is empty, otherwise prompt for overwriting - if any(os.scandir(output_folder)): - pretty_print("WARNING: {} is not 
empty".format(output_folder), style="^") - logging_folder = "{}/metadata".format(output_folder) with open('{}/manifest.txt'.format(logging_folder), 'a+') as f: @@ -505,86 +195,33 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand final_path_already_exists = False final_annotated_path_already_exists = False + # Edit identification + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + pretty_print("Identifying edits", style="~") + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if os.path.exists(final_filtered_sites_path): print("{} exists... skipping edit finding.".format(final_filtered_sites_path)) final_path_already_exists = True - - # Edit finding - if not (coverage_only or filtering_only) and not final_path_already_exists: - overall_total_reads_processed = 0 - if barcode_whitelist_file: - barcode_whitelist = read_barcode_whitelist_file(barcode_whitelist_file) - else: - barcode_whitelist = None - - # Edit identification - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - pretty_print("Identifying edits", style="~") - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - if len(contigs) == 0: - # Take care of the case where no contigs are specified, so that all contigs available are processed - broken_up_contigs = [[]] - else: - if barcode_tag: - # For single cell sequencing we will only process this many contigs at a time - broken_up_contigs = get_broken_up_contigs(contigs, num_per_sublist) - - else: - # For bulk sequencing we will just process all contigs - broken_up_contigs = [contigs] - - print('Contig groups to be processed:', broken_up_contigs) - - overall_counts_summary_df = defaultdict(lambda:0) - overall_total_reads_processed = 0 - for subcontig_list in broken_up_contigs: - - overall_label_to_list_of_contents, results, total_seconds_for_reads_df, total_reads_processed, counts_summary_df = edit_finder( + else: + if not (coverage_only or filtering_only): + run_edit_finding( + barcode_tag, + barcode_whitelist_file, + contigs, + num_per_sublist, bam_filepath, output_folder, strandedness, - barcode_tag, - barcode_whitelist, - subcontig_list, - verbose, - cores=cores, - min_read_quality=min_read_quality, - min_base_quality=min_base_quality, - dist_from_end=min_dist_from_end, - interval_length=interval_length + min_read_quality, + min_base_quality, + min_dist_from_end, + interval_length, + number_of_expected_bams, + cores, + logging_folder, + verbose ) - - for k,v in counts_summary_df.items(): - overall_counts_summary_df[k] += v - - overall_total_reads_processed += total_reads_processed - #total_seconds_for_reads_df.to_csv("{}/edit_finder_timing.tsv".format(logging_folder), sep='\t') - - if barcode_tag: - # Make a subfolder into which the split bams will be placed - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - pretty_print("Contigs processed\n\n\t{}".format(sorted(list(overall_label_to_list_of_contents.keys())))) - pretty_print("Splitting and reconfiguring BAMs to optimize coverage calculations", style="~") - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - total_bam_generation_time, total_seconds_for_bams_df = bam_processing(overall_label_to_list_of_contents, output_folder, 
barcode_tag=barcode_tag, cores=cores, number_of_expected_bams=number_of_expected_bams, verbose=verbose) - #total_seconds_for_bams_df.to_csv("{}/bam_reconfiguration_timing.tsv".format(logging_folder), sep='\t') - pretty_print("Total time to concat and write bams: {} minutes".format(round(total_bam_generation_time/60, 3))) - - print("Deleting overall_label_to_list_of_contents...") - del overall_label_to_list_of_contents - - - with open('{}/manifest.txt'.format(logging_folder), 'a+') as f: - f.write(f'total_reads_processed\t{overall_total_reads_processed}\n') - for k, v in overall_counts_summary_df.items(): - f.write(f'{k}\t{v}\n') - - f.write(f'edits per read (EPR)\t{overall_counts_summary_df.get("total_edits")/overall_total_reads_processed}\n') - reconfigured_bam_filepaths = glob('{}/split_bams/*/*.bam'.format(output_folder)) if not final_path_already_exists and not skip_coverage: @@ -594,7 +231,7 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand # We want to run the samtools depth command for each of the reconfigured bam files print("Running samtools depth on {} subset bam paths...".format(len(reconfigured_bam_filepaths))) - total_time, total_seconds_for_contig_df = generate_depths(output_folder, reconfigured_bam_filepaths, paired_end=paired_end, barcode_tag=barcode_tag) + total_time, total_seconds_for_contig_df = generate_depths(output_folder, reconfigured_bam_filepaths, paired_end=paired_end, barcode_tag=barcode_tag, cores=cores) total_seconds_for_contig_df.to_csv("{}/coverage_calculation_timing.tsv".format(logging_folder), sep='\t') @@ -651,6 +288,8 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand # Annotation option if final_path_already_exists and annotation_bedfile_path: + pretty_print("Annotating edits...", style="~") + final_site_level_information_df = pd.read_csv('{}/final_filtered_site_info.tsv'.format(output_folder), sep='\t') final_site_level_information_annotated_df = annotate_sites(final_site_level_information_df, @@ -685,8 +324,11 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand print(f'Time elapsed: {time.time()-start_time:.2f}s') if final_path_already_exists and all_cells_coverage: + # Coverage across all cells + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + pretty_print("Calculating coverage across all cells at all positions...", style="~") + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ output_suffix = "all_cells" - print("Calculating coverage at all edit sites in all cells...") # Get the list of BAM file paths bam_filepaths = glob(f"{output_folder}/split_bams/*/*.bam") @@ -802,51 +444,26 @@ def check_samtools(): interval_length = args.interval_length num_per_sublist = args.num_per_sublist + print_marine_logo() + check_folder_is_empty_warn_if_not(output_folder) # all_cells_coverage only applies for single cell case if not barcode_tag: if all_cells_coverage == True: + print("WARNING: --all_cells_coverage flag only applies for single cell case, ignoring...") all_cells_coverage = False - if all_cells_coverage: - print("\n\nWill tabulate coverage across all cells... WARNING this can be extremely resource-consuming if there are a lot of cells and a lot of sites. 
Consider first filtering sites and then using the --tabulation_bed argument to specify the specific locations you would like tabulated across all cells.\n\n") - if tabulation_bed: - if os.path.exists(tabulation_bed): - print("\t...using sites in {}".format(tabulation_bed)) - else: - print("{} does not exist! Exiting.".format(tabulation_bed)) - sys.exit(1) - - # Convert bedgraphs argument into list of conversions - if not bedgraphs is None: - if barcode_tag in ['CB', 'IB']: - sys.stderr.write("Can only output bedgraphs for bulk sequencing runs of MARINE") - sys.exit(1) - - bedgraphs_list = bedgraphs.upper().replace('I', 'G').split(',') - for b in bedgraphs_list: - assert(b in ['AC', 'AG', 'AT', 'CA', 'CG', 'CT', 'GA', 'GC', 'GT', 'TA', 'TC', 'TG']) - else: - bedgraphs_list = [] + print_all_cells_coverage_warning(all_cells_coverage, tabulation_bed) - if not sailor is None: - if barcode_tag in ['CB', 'IB']: - sys.stderr.write("Can only output sailor for bulk sequencing runs of MARINE") - sys.exit(1) - - sailor_list = sailor.upper().replace('I', 'G').split(',') - for s in sailor_list: - assert(s in ['AC', 'AG', 'AT', 'CA', 'CG', 'CT', 'GA', 'GC', 'GT', 'TA', 'TC', 'TG']) - else: - sailor_list = [] - + bedgraphs_list = convert_conversions_argument(bedgraphs, barcode_tag, file_type='bedgraph') + sailor_list = convert_conversions_argument(bedgraphs, barcode_tag, file_type='sailor') + assert(strandedness in [0, 1, 2]) if not os.path.exists(output_folder): pretty_print("{} (output folder) does not exist, making folder...".format(output_folder)) os.mkdir(output_folder) - # Get the exact command line used to run this script command_line = " ".join(shlex.quote(arg) for arg in sys.argv) print('command: {}'.format(command_line)) @@ -866,8 +483,6 @@ def check_samtools(): assert(not(coverage_only and filtering_only)) - print_marine_logo() - pretty_print(["Arguments:", "\tBAM filepath:\t{}".format(bam_filepath), "\tAnnotation bedfile filepath:\t{}".format(annotation_bedfile_path), diff --git a/src/core.py b/src/core.py index 8b1afd2..579cd32 100644 --- a/src/core.py +++ b/src/core.py @@ -18,11 +18,157 @@ print_read_info, get_read_information, get_hamming_distance, remove_softclipped_bases,find from utils import get_contig_lengths_dict, get_intervals, index_bam, write_rows_to_info_file, write_header_to_edit_info, \ -write_read_to_bam_file, remove_file_if_exists, make_folder, concat_and_write_bams_wrapper, make_edit_finding_jobs, pretty_print,\ -get_coverage_wrapper, write_reads_to_file, sort_bam, rm_bam, suffixes +write_read_to_bam_file, remove_file_if_exists, make_folder, concat_and_write_bams_wrapper, make_edit_finding_jobs, pretty_print, get_contigs_that_need_bams_written, split_bed_file, \ +get_coverage_wrapper, write_reads_to_file, sort_bam, rm_bam, suffixes, get_broken_up_contigs, run_command, \ +make_depth_command_script_single_cell, concatenate_files, generate_and_run_bash_merge import os, psutil + +def generate_depths(output_folder, bam_filepaths, paired_end=False, barcode_tag=None, cores=1): + + coverage_start_time = time.perf_counter() + + all_depth_commands = [] + + combine_edit_sites_command = ( + "echo 'concatenating bed file...';" + "for file in {}/edit_info/*edit_info.tsv; do " + "awk 'NR > 1 {{print $2, $4-1, $4}}' OFS='\t' \"$file\"; " + "done | sort -k1,1 -k2,2n -u > {}/combined_source_cells.bed;" + ).format(output_folder, output_folder) + + if not os.path.exists(f'{output_folder}/combined_source_cells.bed'): + run_command(combine_edit_sites_command) + + 
all_depth_commands.append(combine_edit_sites_command) + + output_suffix = 'source_cells' + + if barcode_tag: + coverage_subfolder = '{}/coverage'.format(output_folder) + make_folder(coverage_subfolder) + + # Single cell mode + split_bed_file( + f"{output_folder}/combined_{output_suffix}.bed", + f"{output_folder}/combined_{output_suffix}_split_by_suffix", + bam_filepaths, + output_suffix=output_suffix + ) + + make_depth_command_script_single_cell(paired_end, bam_filepaths, output_folder, + all_depth_commands=all_depth_commands, output_suffix='source_cells', run=True, processes=cores, barcode_tag=barcode_tag) + + else: + if paired_end: + paired_end_flag = '-s ' + else: + paired_end_flag = '' + + # Bulk mode, we will not split the bed and simply use samtools depth on the combined.bed + samtools_depth_command = f"samtools depth {paired_end_flag}-a -b {output_folder}/combined_source_cells.bed {bam_filepath} > {output_folder}/depths_source_cells.txt" + run_command(samtools_depth_command) + + + print("Concatenating edit info files...") + concatenate_files(output_folder, "edit_info/*edit_info.tsv", + "{}/final_edit_info_no_coverage.tsv".format(output_folder), + run=True) + + print("Append the depth columns to the concatenated final_edit_info file...") + + header_columns = ['barcode', 'contig', 'position', 'ref', 'alt', + 'read_id', 'strand', 'coverage'] + + + generate_and_run_bash_merge(output_folder, + '{}/final_edit_info_no_coverage.tsv'.format(output_folder), + '{}/depths_source_cells.txt'.format(output_folder), + '{}/final_edit_info.tsv'.format(output_folder), + header_columns, barcode_tag=barcode_tag) + + coverage_total_time = time.perf_counter() - coverage_start_time + + total_seconds_for_contig_df = pd.DataFrame({'coverage_total_time': [coverage_total_time]}) + return coverage_total_time, total_seconds_for_contig_df + + +def bam_processing(bam_filepath, overall_label_to_list_of_contents, output_folder, barcode_tag='CB', cores=1, number_of_expected_bams=4, + verbose=False): + # Only used for single-cell and/or long read reconfiguration of bams to optimize coverage calculation + split_bams_folder = '{}/split_bams'.format(output_folder) + make_folder(split_bams_folder) + contigs_to_generate_bams_for = get_contigs_that_need_bams_written(list(overall_label_to_list_of_contents.keys()), + split_bams_folder, + barcode_tag=barcode_tag, + number_of_expected_bams=number_of_expected_bams + ) + if verbose: + pretty_print("Will split and reconfigure the following contigs: {}".format(",".join(contigs_to_generate_bams_for))) + + + # BAM Generation + total_bam_generation_time, total_seconds_for_bams = run_bam_reconfiguration( + split_bams_folder, + bam_filepath, + overall_label_to_list_of_contents, + contigs_to_generate_bams_for, + barcode_tag=barcode_tag, + cores=cores, + number_of_expected_bams=number_of_expected_bams, + verbose=verbose) + + total_seconds_for_bams_df = pd.DataFrame.from_dict(total_seconds_for_bams, orient='index') + total_seconds_for_bams_df.columns = ['seconds'] + total_seconds_for_bams_df['contigs'] = total_seconds_for_bams_df.index + total_seconds_for_bams_df.index = range(len(total_seconds_for_bams_df)) + + return total_bam_generation_time, total_seconds_for_bams_df + + +def edit_finder(bam_filepath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], + verbose=False, cores=64, min_read_quality=0, min_base_quality=0, dist_from_end=0, interval_length=2000000): + + pretty_print("Each contig is being split into subsets of length...".format(interval_length)) 
+ + overall_label_to_list_of_contents, results, overall_time, overall_total_reads, \ + total_seconds_for_reads, counts_summary_df = run_edit_identifier( + bam_filepath, + output_folder, + strandedness=strandedness, + barcode_tag=barcode_tag, + barcode_whitelist=barcode_whitelist, + contigs=contigs, + verbose=verbose, + cores=cores, + min_read_quality=min_read_quality, + min_base_quality=min_base_quality, + dist_from_end=dist_from_end, + interval_length=interval_length + ) + + #print(overall_label_to_list_of_contents.keys()) + #print(overall_label_to_list_of_contents.get(list(overall_label_to_list_of_contents.keys())[0])) + + pretty_print( + [ + "Reads processed:\t{}".format(overall_total_reads), + "Time to process reads in min:\t{}".format(round(overall_time/60, 5)), + "Read Summary:\n{}".format(counts_summary_df) + ], + style="-" + ) + + + total_seconds_for_reads_df = pd.DataFrame.from_dict(total_seconds_for_reads, orient='index') + total_seconds_for_reads_df.columns = ['seconds'] + total_seconds_for_reads_df['reads'] = total_seconds_for_reads_df.index + total_seconds_for_reads_df.index = range(len(total_seconds_for_reads_df)) + + + return overall_label_to_list_of_contents, results, total_seconds_for_reads_df, overall_total_reads, counts_summary_df + def run_edit_identifier(bampath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], verbose=False, cores=64, min_read_quality=0, min_base_quality=0, dist_from_end=0, interval_length=2000000): # Make subfolder in which to information about edits @@ -110,6 +256,92 @@ def run_bam_reconfiguration(split_bams_folder, bampath, overall_label_to_list_of return total_bam_generation_time, total_seconds_for_bams +def run_edit_finding(barcode_tag, + barcode_whitelist_file, + contigs, + num_per_sublist, + bam_filepath, + output_folder, + strandedness, + min_read_quality, + min_base_quality, + min_dist_from_end, + interval_length, + number_of_expected_bams, + cores, + logging_folder, + verbose=False + ): + overall_total_reads_processed = 0 + if barcode_whitelist_file: + barcode_whitelist = read_barcode_whitelist_file(barcode_whitelist_file) + else: + barcode_whitelist = None + + if len(contigs) == 0: + # Take care of the case where no contigs are specified, so that all contigs available are processed + broken_up_contigs = [[]] + else: + if barcode_tag: + # For single cell sequencing we will only process this many contigs at a time + broken_up_contigs = get_broken_up_contigs(contigs, num_per_sublist) + + else: + # For bulk sequencing we will just process all contigs + broken_up_contigs = [contigs] + + print('Contig groups to be processed:', broken_up_contigs) + + overall_counts_summary_df = defaultdict(lambda:0) + overall_total_reads_processed = 0 + for subcontig_list in broken_up_contigs: + + overall_label_to_list_of_contents, results, total_seconds_for_reads_df, total_reads_processed, counts_summary_df = edit_finder( + bam_filepath, + output_folder, + strandedness, + barcode_tag, + barcode_whitelist, + subcontig_list, + verbose, + cores=cores, + min_read_quality=min_read_quality, + min_base_quality=min_base_quality, + dist_from_end=min_dist_from_end, + interval_length=interval_length + ) + + for k,v in counts_summary_df.items(): + overall_counts_summary_df[k] += v + + overall_total_reads_processed += total_reads_processed + + #total_seconds_for_reads_df.to_csv("{}/edit_finder_timing.tsv".format(logging_folder), sep='\t') + + if barcode_tag: + # Make a subfolder into which the split bams will be placed + # 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + pretty_print("Contigs processed\n\n\t{}".format(sorted(list(overall_label_to_list_of_contents.keys())))) + pretty_print("Splitting and reconfiguring BAMs to optimize coverage calculations", style="~") + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + total_bam_generation_time, total_seconds_for_bams_df = bam_processing(bam_filepath, overall_label_to_list_of_contents, output_folder, barcode_tag=barcode_tag, cores=cores, number_of_expected_bams=number_of_expected_bams, verbose=verbose) + #total_seconds_for_bams_df.to_csv("{}/bam_reconfiguration_timing.tsv".format(logging_folder), sep='\t') + pretty_print("Total time to concat and write bams: {} minutes".format(round(total_bam_generation_time/60, 3))) + + print("Deleting overall_label_to_list_of_contents...") + del overall_label_to_list_of_contents + + + with open('{}/manifest.txt'.format(logging_folder), 'a+') as f: + f.write(f'total_reads_processed\t{overall_total_reads_processed}\n') + for k, v in overall_counts_summary_df.items(): + f.write(f'{k}\t{v}\n') + + f.write(f'edits per read (EPR)\t{overall_counts_summary_df.get("total_edits")/overall_total_reads_processed}\n') + + def incorporate_barcode(read_as_string, contig, barcode): read_tab_separated = read_as_string.split('\t') contig_section = '{}_{}'.format(contig, barcode) diff --git a/src/utils.py b/src/utils.py index fdb9cd1..76c9f83 100644 --- a/src/utils.py +++ b/src/utils.py @@ -14,9 +14,29 @@ from multiprocessing import Pool import multiprocessing import time +import scipy.sparse as sp +import anndata as ad +# Number of barcode characters to use as suffix during splitting CB_N = 1 +def print_marine_logo(): + logo_lines = [ + ":::: :::: ::: ::::::::: ::::::::::: :::: ::: :::::::::: ", + "+:+:+: :+:+:+ :+: :+: :+: :+: :+: :+:+: :+: :+: ", + "+:+ +:+:+ +:+ +:+ +:+ +:+ +:+ +:+ :+:+:+ +:+ +:+ ", + "+#+ +:+ +#+ +#++:++#++: +#++:++#: +#+ +#+ +:+ +#+ +#++:++# ", + "+#+ +#+ +#+ +#+ +#+ +#+ +#+ +#+ +#+#+# +#+ ", + "#+# #+# #+# #+# #+# #+# #+# #+# #+#+# #+# ", + "### ### ### ### ### ### ########### ### #### ########## " + ] + for l in logo_lines: + pretty_print(l) + + pretty_print("Multi-core Algorithm for Rapid Identification of Nucleotide Edits", style="=") + + + def generate_permutations_list_for_CB(n): """ Generate all permutations of A, C, G, T for strings of length n @@ -95,6 +115,94 @@ def generate_permutations_list_for_CB(n): ] } + +def generate_bedgraphs(final_site_level_information_df, conversion_search, output_folder): + bedgraph_folder = '{}/bedgraphs'.format(output_folder) + make_folder(bedgraph_folder) + + pretty_print("Making bedgraphs for {} conversions...\n".format(bedgraphs_list)) + for conversion in bedgraphs_list: + conversion_search = conversion[0] + '>' + conversion[1] + sites_for_conversion = final_site_level_information_df[final_site_level_information_df.conversion == conversion_search] + sites_for_conversion['edit_fraction'] = sites_for_conversion['count']/sites_for_conversion['coverage'] + sites_for_conversion['start'] = sites_for_conversion['position'] - 1 + sites_for_conversion_bedgraph_cols = sites_for_conversion[['contig', 'start', 'position', 'edit_fraction']] + + sites_for_conversion_bedgraph_cols.to_csv('{}/{}_{}.bedgraph'.format(bedgraph_folder, output_folder.split('/')[-1], conversion), sep='\t', index=False, header=False) + + + +def 
convert_sites_to_sailor(final_site_level_information_df, sailor_list, output_folder, skip_coverage): + # Output SAILOR-formatted file for use in FLARE downstream + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # 1 629275 629276 0.966040688 2,30 + + # 1 629309 629310 2.8306e-05 1,1043 + + + for conversion in sailor_list: + conversion_search = conversion[0] + '>' + conversion[1] + + print("Generating SAILOR-style bed outputs for conversion {}...".format(conversion)) + + sailor_sites,weird_sites = get_sailor_sites(final_site_level_information_df, conversion_search, skip_coverage=skip_coverage) + sailor_sites = sailor_sites.drop_duplicates() + + print("{} final deduplicated {} SAILOR-formatted sites".format(len(sailor_sites), conversion_search)) + sailor_sites.to_csv('{}/sailor_style_sites_{}.bed'.format( + output_folder, + conversion_search.replace(">", "-")), + header=False, + index=False, + sep='\t') + + + +def split_bed_file(input_bed_file, output_folder, bam_filepaths, output_suffix=''): + """ + Split a BED file into multiple files based on suffixes in the first column. + Each line is assigned to the appropriate file based on the suffix. + + e.g.: + + 10_AAACGAAAGTCACACT-1 6143263 6143264 + 10_AAACGAAAGTCACACT-1 11912575 11912576 + 10_AAACGAAAGTCACACT-1 12209751 12209752 + 10_AAACGAAAGTCACACT-1 13320235 13320236 + 10_AAACGAAAGTCACACT-1 27036085 27036086 + + """ + if not os.path.exists(output_folder): + os.makedirs(output_folder) + + single_cell_approach = len(bam_filepaths) > 0 + + suffix_pairs = [ + (os.path.basename(bam).split("_")[0], + os.path.basename(bam).split("_")[1].split(".")[0]) for bam in bam_filepaths + ] + + # Open file handles for each suffix + file_handles = {} + for prefix, suffix in suffix_pairs: + output_file = os.path.join(output_folder, f"combined_{output_suffix}_{prefix}_{suffix}.bed") + file_handles[prefix + suffix] = open(output_file, 'w') + + try: + with open(input_bed_file, 'r') as infile: + for line in infile: + # Parse the first column to determine the suffix + columns = line.split() + + chrom = columns[0] # Assuming the first column is the chromosome + for prefix, suffix in suffix_pairs: + if chrom.startswith(f"{prefix}_") and chrom.endswith(suffix): + file_handles[prefix + suffix].write(line) + break + + finally: + # Close all file handles + for handle in file_handles.values(): + handle.close() + def get_contigs_that_need_bams_written(expected_contigs, split_bams_folder, barcode_tag='CB', number_of_expected_bams=4): bam_indices_written = [f.split('/')[-1].split('.bam')[0] for f in glob('{}/*/*.sorted.bam.bai'.format(split_bams_folder))] @@ -118,6 +226,68 @@ def get_contigs_that_need_bams_written(expected_contigs, split_bams_folder, barc return contigs_to_write_bams_for +def get_broken_up_contigs(contigs, num_per_sublist): + broken_up_contigs = [] + + i_options = range((math.ceil(len(contigs)/num_per_sublist)) + 1) + + for i in i_options: + contig_sublist = [] + j_options = range(i*num_per_sublist, (i*num_per_sublist) + num_per_sublist) + + for j in j_options: + if j < len(contigs): + contig_sublist.append(contigs[j]) + + if len(contig_sublist) > 0: + broken_up_contigs.append(contig_sublist) + return broken_up_contigs + + +def pivot_edits_to_sparse(df, output_folder): + + # Create a new column for contig:position + df["CombinedPosition"] = df["contig"].astype(str) + ":" + df["position"].astype(str) + + # Ensure the output directory exists + final_output_dir = os.path.join(output_folder, "final_matrix_outputs") + os.makedirs(final_output_dir, 
exist_ok=True) + + print(f"Saving edit sparse matrices to {final_output_dir}") + + for strand_conversion in df.strand_conversion.unique(): + print(f"\tProcessing strand_conversion: {strand_conversion}") + + # Pivot the dataframe + pivoted_df = df[df.strand_conversion == strand_conversion].pivot( + index="CombinedPosition", + columns="barcode", + values="count" + ) + + # Replace NaN with 0 for missing values + pivoted_df = pivoted_df.fillna(0) + + # Convert to a sparse matrix + sparse_matrix = sp.csr_matrix(pivoted_df.values) + + # Create an AnnData object + adata = ad.AnnData( + X=sparse_matrix, + obs=pd.DataFrame(index=pivoted_df.index), # Row (site) metadata + var=pd.DataFrame(index=pivoted_df.columns) # Column (barcode) metadata + ) + + # Save the AnnData object + output_file_name = f"comprehensive_{strand_conversion.replace('>', '_')}_edits_matrix.h5ad" + output_file = os.path.join( + final_output_dir, + output_file_name + ) + adata.write(output_file) + print(f"\t\tSaved sparse matrix for {strand_conversion} to {output_file_name}") + + def make_edit_finding_jobs(bampath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], verbose=False, min_read_quality=0, min_base_quality=0, dist_from_end=0, interval_length=2000000): jobs = [] @@ -182,7 +352,30 @@ def read_barcode_whitelist_file(barcode_whitelist_file): pretty_print("Barcodes in whitelist: {}".format(len(barcode_whitelist))) return barcode_whitelist +def print_all_cells_coverage_warning(all_cells_coverage, tabulation_bed): + if all_cells_coverage: + print("\n\nWill tabulate coverage across all cells... WARNING this can be extremely resource-consuming if there are a lot of cells and a lot of sites. Consider first filtering sites and then using the --tabulation_bed argument to specify the specific locations you would like tabulated across all cells.\n\n") + if tabulation_bed: + if os.path.exists(tabulation_bed): + print("\t...using sites in {}".format(tabulation_bed)) + else: + print("{} does not exist! Exiting.".format(tabulation_bed)) + sys.exit(1) +def convert_conversions_argument(conversions, barcode_tag, file_type=None): + # Convert bedgraphs argument into list of conversions + if not conversions is None: + if barcode_tag in ['CB', 'IB']: + sys.stderr.write(f"Can only output {file_type} for bulk sequencing runs of MARINE") + sys.exit(1) + + conversions_list = conversions.upper().replace('I', 'G').split(',') + for b in conversions_list: + assert(b in ['AC', 'AG', 'AT', 'CA', 'CG', 'CT', 'GA', 'GC', 'GT', 'TA', 'TC', 'TG']) + else: + conversions_list = [] + return conversions_list + def pretty_print(contents, style=''): if type(contents) == list: for item in contents: @@ -195,7 +388,7 @@ def pretty_print(contents, style=''): before_line = None after_line = None - styled_line = ''.join([style for i in range(len(to_write))]) + styled_line = ''.join([style for i in range(min(100, len(to_write)))]) if style != '': # Line before @@ -754,7 +947,7 @@ def merge_files_by_chromosome(args): # Use bash to execute the paste command run_command(f"bash -c '{paste_command}'") - print(f"Columnar merge complete for {chromosome}. Output saved to {merged_file}.") + print(f"\tColumnar merge complete for {chromosome}. Output saved to {merged_file}.") def prepare_matrix_files_multiprocess(output_matrix_folder, @@ -763,7 +956,7 @@ def prepare_matrix_files_multiprocess(output_matrix_folder, """ Merges matrix files column-wise, grouping by chromosome, using multiprocessing. 
""" - print("Merging matrices column-wise by chromosome...") + print("\n\nMerging matrices column-wise by chromosome...") # Group files by chromosome matrix_files = [ @@ -789,7 +982,7 @@ def prepare_matrix_files_multiprocess(output_matrix_folder, with Pool(processes=processes) as pool: pool.map(merge_files_by_chromosome, task_args) - print("All columnar merges complete.") + print("All columnar merges complete.\n") @@ -813,7 +1006,7 @@ def calculate_coverage(bam_filepath, bed_filepath, output_filepath, output_matri depths_file = output_filepath - print(f"Running samtools view on {bed_filepath} for {bam_filepath}, outputting to {output_filepath}") + print(f"\tRunning samtools view on {bed_filepath} for {bam_filepath}, outputting to {output_filepath}\n") regions = [] with open(bed_filepath, "r") as bed: @@ -823,7 +1016,7 @@ def calculate_coverage(bam_filepath, bed_filepath, output_filepath, output_matri regions.append((chrom, start, end)) if len(regions) > 0: - print(f"\t{len(regions)} regions") + print(f"\t{bed_filepath.split('/')[-1]}: {len(regions)} regions") with pysam.AlignmentFile(bam_filepath, "rb") as bam, open(depths_file, "w") as out: try: @@ -882,8 +1075,6 @@ def prepare_pysam_coverage_args(bam_filepaths, output_folder, output_suffix='', if len(output_suffix) > 0: output_suffix = f"_{output_suffix}" - print("prepare_pysam_coverage_args, barcode_tag is {}".format(barcode_tag)) - for bam_filepath in bam_filepaths: # Extract suffix from BAM filename bam_filename = os.path.basename(bam_filepath) @@ -909,7 +1100,17 @@ def prepare_pysam_coverage_args(bam_filepaths, output_folder, output_suffix='', return args_list - +def check_folder_is_empty_warn_if_not(output_folder): + # Check to make sure the folder is empty, otherwise prompt for overwriting + if any(os.scandir(output_folder)): + file_info = [] + for i in os.scandir(output_folder): + file_info.append('\tFile: {}'.format(i)) + + pretty_print("WARNING: {} is not empty\n{}".format(output_folder, + '\n'.join(file_info) + ), style="^") + def make_depth_command_script_single_cell(paired_end, bam_filepaths, output_folder, all_depth_commands=[], output_suffix='', run=False, pivot=False, processes=4, barcode_tag=None): """ @@ -930,7 +1131,7 @@ def make_depth_command_script_single_cell(paired_end, bam_filepaths, output_fold f.write('{}\n\n'.format(d)) if run: - print("Calculating depths using multiprocessing with pysam...") + pretty_print("\nCalculating depths using multiprocessing with pysam...", style='.') run_pysam_count_coverage(pysam_coverage_args, processes) merge_depth_files(output_folder, output_suffix) diff --git a/tests/integration_tests.ipynb b/tests/integration_tests.ipynb index dcaa5f1..1e86212 100644 --- a/tests/integration_tests.ipynb +++ b/tests/integration_tests.ipynb @@ -2889,7 +2889,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "f5c3ed3e-13dd-4399-924e-3d1ac17ce387", "metadata": {}, "outputs": [ @@ -2998,23 +2998,25 @@ "\n", "\n", "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", - "Checking results for long_read_sc_test\n" - ] - }, - { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: 'singlecell_tests/long_read_sc_test/final_filtered_site_info_annotated.tsv'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[4], line 267\u001b[0m\n\u001b[1;32m 
265\u001b[0m folder \u001b[38;5;241m=\u001b[39m info\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfolder\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 266\u001b[0m final_filtered_site_info_annotated \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m/final_filtered_site_info_annotated.tsv\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(folder, test_name)\n\u001b[0;32m--> 267\u001b[0m final_filtered_site_info_annotated_df \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfinal_filtered_site_info_annotated\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msep\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;130;43;01m\\t\u001b[39;49;00m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex_col\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 269\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtotal_edit_sites\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m info:\n\u001b[1;32m 270\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n", - "File \u001b[0;32m~/miniconda3/envs/marine_environment/lib/python3.8/site-packages/pandas/io/parsers/readers.py:912\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 899\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 900\u001b[0m dialect,\n\u001b[1;32m 901\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 908\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m 909\u001b[0m )\n\u001b[1;32m 910\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m--> 912\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/miniconda3/envs/marine_environment/lib/python3.8/site-packages/pandas/io/parsers/readers.py:577\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 574\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 576\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 577\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 579\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m 580\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n", - "File \u001b[0;32m~/miniconda3/envs/marine_environment/lib/python3.8/site-packages/pandas/io/parsers/readers.py:1407\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1404\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 1406\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1407\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/miniconda3/envs/marine_environment/lib/python3.8/site-packages/pandas/io/parsers/readers.py:1661\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1659\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1660\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1661\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1662\u001b[0m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1663\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1664\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1665\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1666\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1667\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1668\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1669\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1670\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1671\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1672\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n", - "File \u001b[0;32m~/miniconda3/envs/marine_environment/lib/python3.8/site-packages/pandas/io/common.py:859\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 854\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 855\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m 856\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m 857\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m 858\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 859\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 860\u001b[0m \u001b[43m \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 861\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 862\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 863\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 864\u001b[0m \u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 865\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 866\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 867\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m 868\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n", - "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'singlecell_tests/long_read_sc_test/final_filtered_site_info_annotated.tsv'" + "Checking results for long_read_sc_test\n", + "\tExpecting: {'contig': '6', 'barcode': 'ENSMUST00000203816-AACGTGTTGGAGAGGG-16-G', 'position': 115807969, 'num_rows': 1, 'count': 1, 'coverage': 1, 'conversion': 'A>C', 'strand_conversion': 'T>G', 'strand': '-', 'feature_name': 'Rpl32'}\n", + "\n", + "\t >>> long_read_sc_test passed! <<<\n", + "\n", + "\tExpecting: {'contig': '6', 'barcode': 'ENSMUST00000081840-AAGTCGTACCAGGCTC-40-C', 'position': 115805653, 'num_rows': 1, 'count': 1, 'coverage': 1, 'conversion': 'G>A', 'strand_conversion': 'C>T', 'strand': '-', 'feature_name': 'Rpl32'}\n", + "\n", + "\t >>> long_read_sc_test passed! <<<\n", + "\n", + "\tExpecting: {'contig': '6', 'barcode': 'ENSMUST00000081840-AACGTGTTGGAGAGGG-40-G', 'position': 115807015, 'num_rows': 1, 'count': 1, 'coverage': 8, 'conversion': 'C>T', 'strand_conversion': 'G>A', 'strand': '-', 'feature_name': 'Rpl32'}\n", + "\n", + "\t >>> long_read_sc_test passed! <<<\n", + "\n", + "Checking that analyzing a single-cell dataset in 'bulk' mode (i.e. not specificying the 'CB' barcode) yields the exact same positions and base changes, but with counts and coverages aggregated rather than at a single-cell resolution\n", + "grouped_sc_rows: 62, bulk_rows: 62\n", + "\n", + "\t >>> single-cell and bulk on same dataset comparison passed! <<<\n", + "\n", + "There were 0 failures\n" ] } ],
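For context, a hedged usage sketch for the sparse outputs written by pivot_edits_to_sparse() in this patch: each comprehensive_<conversion>_edits_matrix.h5ad file under final_matrix_outputs/ can be read back with anndata. The "marine_output" folder name below is a placeholder, not a path produced by the patch.

import anndata as ad

# Folder and file naming follow pivot_edits_to_sparse():
# <output_folder>/final_matrix_outputs/comprehensive_<conversion>_edits_matrix.h5ad
adata = ad.read_h5ad("marine_output/final_matrix_outputs/comprehensive_C_T_edits_matrix.h5ad")

print(adata.shape)           # (n_sites, n_barcodes)
print(adata.obs_names[:5])   # CombinedPosition strings, e.g. "6:115807015"
print(adata.var_names[:5])   # cell barcodes
edit_counts_per_site = adata.X.sum(axis=1)  # total edited reads per site across all cells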