From 96273316de62bd0924835c39dacc95b24e46fa14 Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Fri, 20 Dec 2024 14:54:07 -0800 Subject: [PATCH] clean up code --- marine.py | 477 ++++------------------------------ src/core.py | 236 ++++++++++++++++- src/utils.py | 221 +++++++++++++++- tests/integration_tests.ipynb | 38 +-- 4 files changed, 511 insertions(+), 461 deletions(-) diff --git a/marine.py b/marine.py index c4e0d74..6f98f13 100755 --- a/marine.py +++ b/marine.py @@ -19,291 +19,23 @@ from matplotlib import pyplot as plt import math import shlex -import scipy.sparse as sp -import anndata as ad import os -# checkpoint - sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'src/')) -from read_process import incorporate_replaced_pos_info,incorporate_insertions_and_deletions,\ -get_positions_from_md_tag,reverse_complement,get_edit_information,get_edit_information_wrapper,\ -has_edits,get_total_coverage_for_contig_at_position,\ -print_read_info, get_read_information, get_hamming_distance, \ -remove_softclipped_bases,find - -from utils import get_intervals, index_bam, write_rows_to_info_file, write_header_to_edit_info, \ -write_read_to_bam_file, remove_file_if_exists, make_folder, concat_and_write_bams_wrapper, \ -pretty_print, read_barcode_whitelist_file, get_contigs_that_need_bams_written, \ -make_depth_command_script_single_cell, generate_and_run_bash_merge, get_sailor_sites, \ -concatenate_files, run_command, get_edits_with_coverage_df, zero_edit_found, delete_intermediate_files +from utils import make_folder, pretty_print, make_depth_command_script_single_cell, \ +concatenate_files, get_edits_with_coverage_df, zero_edit_found, delete_intermediate_files, \ +pivot_edits_to_sparse, print_marine_logo, convert_sites_to_sailor, convert_conversions_argument, \ +generate_bedgraphs, check_folder_is_empty_warn_if_not, print_all_cells_coverage_warning -from core import run_edit_identifier, run_bam_reconfiguration, \ -gather_edit_information_across_subcontigs, run_coverage_calculator, generate_site_level_information +from core import run_edit_identifier, run_bam_reconfiguration, run_edit_finding, \ +gather_edit_information_across_subcontigs, run_coverage_calculator, generate_site_level_information, \ +generate_depths from annotate import annotate_sites, get_strand_specific_conversion -def edit_finder(bam_filepath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], - verbose=False, cores=64, min_read_quality=0, min_base_quality=0, dist_from_end=0, interval_length=2000000): - - pretty_print("Each contig is being split into subsets of length...".format(interval_length)) - - overall_label_to_list_of_contents, results, overall_time, overall_total_reads, \ - total_seconds_for_reads, counts_summary_df = run_edit_identifier( - bam_filepath, - output_folder, - strandedness=strandedness, - barcode_tag=barcode_tag, - barcode_whitelist=barcode_whitelist, - contigs=contigs, - verbose=verbose, - cores=cores, - min_read_quality=min_read_quality, - min_base_quality=min_base_quality, - dist_from_end=dist_from_end, - interval_length=interval_length - ) - - #print(overall_label_to_list_of_contents.keys()) - #print(overall_label_to_list_of_contents.get(list(overall_label_to_list_of_contents.keys())[0])) - - pretty_print( - [ - "Reads processed:\t{}".format(overall_total_reads), - "Time to process reads in min:\t{}".format(round(overall_time/60, 5)), - "Read Summary:\n{}".format(counts_summary_df) - ], - style="-" - ) - - - total_seconds_for_reads_df = 
pd.DataFrame.from_dict(total_seconds_for_reads, orient='index') - total_seconds_for_reads_df.columns = ['seconds'] - total_seconds_for_reads_df['reads'] = total_seconds_for_reads_df.index - total_seconds_for_reads_df.index = range(len(total_seconds_for_reads_df)) - - - return overall_label_to_list_of_contents, results, total_seconds_for_reads_df, overall_total_reads, counts_summary_df - - -def bam_processing(overall_label_to_list_of_contents, output_folder, barcode_tag='CB', cores=1, number_of_expected_bams=4, - verbose=False): - # Only used for single-cell and/or long read reconfiguration of bams to optimize coverage calculation - split_bams_folder = '{}/split_bams'.format(output_folder) - make_folder(split_bams_folder) - contigs_to_generate_bams_for = get_contigs_that_need_bams_written(list(overall_label_to_list_of_contents.keys()), - split_bams_folder, - barcode_tag=barcode_tag, - number_of_expected_bams=number_of_expected_bams - ) - if verbose: - pretty_print("Will split and reconfigure the following contigs: {}".format(",".join(contigs_to_generate_bams_for))) - - - # BAM Generation - total_bam_generation_time, total_seconds_for_bams = run_bam_reconfiguration(split_bams_folder, bam_filepath, overall_label_to_list_of_contents, contigs_to_generate_bams_for, barcode_tag=barcode_tag, cores=cores, - number_of_expected_bams=number_of_expected_bams, - verbose=verbose) - - total_seconds_for_bams_df = pd.DataFrame.from_dict(total_seconds_for_bams, orient='index') - total_seconds_for_bams_df.columns = ['seconds'] - total_seconds_for_bams_df['contigs'] = total_seconds_for_bams_df.index - total_seconds_for_bams_df.index = range(len(total_seconds_for_bams_df)) - - return total_bam_generation_time, total_seconds_for_bams_df - - -def print_marine_logo(): - logo_lines = [ - ":::: :::: ::: ::::::::: ::::::::::: :::: ::: :::::::::: ", - "+:+:+: :+:+:+ :+: :+: :+: :+: :+: :+:+: :+: :+: ", - "+:+ +:+:+ +:+ +:+ +:+ +:+ +:+ +:+ :+:+:+ +:+ +:+ ", - "+#+ +:+ +#+ +#++:++#++: +#++:++#: +#+ +#+ +:+ +#+ +#++:++# ", - "+#+ +#+ +#+ +#+ +#+ +#+ +#+ +#+ +#+#+# +#+ ", - "#+# #+# #+# #+# #+# #+# #+# #+# #+#+# #+# ", - "### ### ### ### ### ### ########### ### #### ########## " - ] - for l in logo_lines: - pretty_print(l) - - pretty_print("Multi-core Algorithm for Rapid Identification of Nucleotide Edits", style="=") - - -def get_broken_up_contigs(contigs, num_per_sublist): - broken_up_contigs = [] - - i_options = range((math.ceil(len(contigs)/num_per_sublist)) + 1) - - for i in i_options: - contig_sublist = [] - j_options = range(i*num_per_sublist, (i*num_per_sublist) + num_per_sublist) - - for j in j_options: - if j < len(contigs): - contig_sublist.append(contigs[j]) - - if len(contig_sublist) > 0: - broken_up_contigs.append(contig_sublist) - return broken_up_contigs - -def split_bed_file(input_bed_file, output_folder, bam_filepaths, output_suffix=''): - """ - Split a BED file into multiple files based on suffixes in the first column. - Each line is assigned to the appropriate file based on the suffix. 
- - e.g.: - - 10_AAACGAAAGTCACACT-1 6143263 6143264 - 10_AAACGAAAGTCACACT-1 11912575 11912576 - 10_AAACGAAAGTCACACT-1 12209751 12209752 - 10_AAACGAAAGTCACACT-1 13320235 13320236 - 10_AAACGAAAGTCACACT-1 27036085 27036086 - - """ - if not os.path.exists(output_folder): - os.makedirs(output_folder) - - single_cell_approach = len(bam_filepaths) > 0 - - suffix_pairs = [ - (os.path.basename(bam).split("_")[0], - os.path.basename(bam).split("_")[1].split(".")[0]) for bam in bam_filepaths - ] - - # Open file handles for each suffix - file_handles = {} - for prefix, suffix in suffix_pairs: - output_file = os.path.join(output_folder, f"combined_{output_suffix}_{prefix}_{suffix}.bed") - file_handles[prefix + suffix] = open(output_file, 'w') - - try: - with open(input_bed_file, 'r') as infile: - for line in infile: - # Parse the first column to determine the suffix - columns = line.split() - - chrom = columns[0] # Assuming the first column is the chromosome - for prefix, suffix in suffix_pairs: - if chrom.startswith(f"{prefix}_") and chrom.endswith(suffix): - file_handles[prefix + suffix].write(line) - break - - finally: - # Close all file handles - for handle in file_handles.values(): - handle.close() - - -def generate_depths(output_folder, bam_filepaths, paired_end=False, barcode_tag=None): - - coverage_start_time = time.perf_counter() - - all_depth_commands = [] - - combine_edit_sites_command = ( - "echo 'concatenating bed file...';" - "for file in {}/edit_info/*edit_info.tsv; do " - "awk 'NR > 1 {{print $2, $4-1, $4}}' OFS='\t' \"$file\"; " - "done | sort -k1,1 -k2,2n -u > {}/combined_source_cells.bed;" - ).format(output_folder, output_folder) - - if not os.path.exists(f'{output_folder}/combined_source_cells.bed'): - run_command(combine_edit_sites_command) - - all_depth_commands.append(combine_edit_sites_command) - - output_suffix = 'source_cells' - - if barcode_tag: - coverage_subfolder = '{}/coverage'.format(output_folder) - make_folder(coverage_subfolder) - - # Single cell mode - split_bed_file( - f"{output_folder}/combined_{output_suffix}.bed", - f"{output_folder}/combined_{output_suffix}_split_by_suffix", - bam_filepaths, - output_suffix=output_suffix - ) - - make_depth_command_script_single_cell(paired_end, bam_filepaths, output_folder, - all_depth_commands=all_depth_commands, output_suffix='source_cells', run=True, processes=cores, barcode_tag=barcode_tag) - - else: - if paired_end: - paired_end_flag = '-s ' - else: - paired_end_flag = '' - - # Bulk mode, we will not split the bed and simply use samtools depth on the combined.bed - samtools_depth_command = f"samtools depth {paired_end_flag}-a -b {output_folder}/combined_source_cells.bed {bam_filepath} > {output_folder}/depths_source_cells.txt" - run_command(samtools_depth_command) - - - print("Concatenating edit info files...") - concatenate_files(output_folder, "edit_info/*edit_info.tsv", - "{}/final_edit_info_no_coverage.tsv".format(output_folder), - run=True) - - print("Append the depth columns to the concatenated final_edit_info file...") - - header_columns = ['barcode', 'contig', 'position', 'ref', 'alt', - 'read_id', 'strand', 'coverage'] - - - generate_and_run_bash_merge(output_folder, - '{}/final_edit_info_no_coverage.tsv'.format(output_folder), - '{}/depths_source_cells.txt'.format(output_folder), - '{}/final_edit_info.tsv'.format(output_folder), - header_columns, barcode_tag=barcode_tag) - - coverage_total_time = time.perf_counter() - coverage_start_time - - total_seconds_for_contig_df = pd.DataFrame({'coverage_total_time': 
[coverage_total_time]}) - return coverage_total_time, total_seconds_for_contig_df - - -def convert_sites_to_sailor(final_site_level_information_df, sailor_list, output_folder, skip_coverage): - # Output SAILOR-formatted file for use in FLARE downstream - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # 1 629275 629276 0.966040688 2,30 + - # 1 629309 629310 2.8306e-05 1,1043 + - - for conversion in sailor_list: - conversion_search = conversion[0] + '>' + conversion[1] - - print("Generating SAILOR-style bed outputs for conversion {}...".format(conversion)) - - sailor_sites,weird_sites = get_sailor_sites(final_site_level_information_df, conversion_search, skip_coverage=skip_coverage) - sailor_sites = sailor_sites.drop_duplicates() - - print("{} final deduplicated {} SAILOR-formatted sites".format(len(sailor_sites), conversion_search)) - sailor_sites.to_csv('{}/sailor_style_sites_{}.bed'.format( - output_folder, - conversion_search.replace(">", "-")), - header=False, - index=False, - sep='\t') - - -def generate_bedgraphs(final_site_level_information_df, conversion_search, output_folder): - bedgraph_folder = '{}/bedgraphs'.format(output_folder) - make_folder(bedgraph_folder) - - pretty_print("Making bedgraphs for {} conversions...\n".format(bedgraphs_list)) - for conversion in bedgraphs_list: - conversion_search = conversion[0] + '>' + conversion[1] - sites_for_conversion = final_site_level_information_df[final_site_level_information_df.conversion == conversion_search] - sites_for_conversion['edit_fraction'] = sites_for_conversion['count']/sites_for_conversion['coverage'] - sites_for_conversion['start'] = sites_for_conversion['position'] - 1 - sites_for_conversion_bedgraph_cols = sites_for_conversion[['contig', 'start', 'position', 'edit_fraction']] - - sites_for_conversion_bedgraph_cols.to_csv('{}/{}_{}.bedgraph'.format(bedgraph_folder, output_folder.split('/')[-1], conversion), sep='\t', index=False, header=False) - - def prepare_combinations_for_split(df, bam_filepaths, output_folder, output_suffix): """ Prepares the chromosome-suffix combinations for multiprocessing. 
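The hunk above only shows the docstring of prepare_combinations_for_split(); as a rough sketch (not the function's actual body), the chromosome/suffix combinations it prepares could be assembled as below, reusing the <prefix>_<suffix>.bam naming that split_bed_file() parses and the process_combination_for_split() worker that pool.map() dispatches later in this patch. The exact tuple fields are an assumption.

import os
from multiprocessing import Pool

def prepare_combinations_sketch(df, bam_filepaths, output_folder, output_suffix):
    # Suffix pairs parsed from split BAM names of the form <prefix>_<suffix>.bam,
    # mirroring the parsing done in split_bed_file()
    suffix_pairs = [
        (os.path.basename(b).split("_")[0],
         os.path.basename(b).split("_")[1].split(".")[0])
        for b in bam_filepaths
    ]
    chroms = df["contig"].astype(str).unique()
    # One task per (chromosome, prefix, suffix), so each worker writes one BED shard
    return [
        (chrom, prefix, suffix, output_folder, output_suffix)
        for chrom in chroms
        for prefix, suffix in suffix_pairs
    ]

# Assumed dispatch, matching the Pool pattern used elsewhere in this patch:
# with Pool(processes=4) as pool:
#     pool.map(process_combination_for_split, combinations)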
@@ -376,46 +108,6 @@ def process_combination_for_split(args): print(f"\t\t\t>>> Processed {chrom}, {prefix}_{suffix} -> {output_file}") -def pivot_edits_to_sparse(df, output_folder): - # Create a new column for contig:position - df["CombinedPosition"] = df["contig"].astype(str) + ":" + df["position"].astype(str) - - # Ensure the output directory exists - final_output_dir = os.path.join(output_folder, "final_matrix_outputs") - os.makedirs(final_output_dir, exist_ok=True) - - for strand_conversion in df.strand_conversion.unique(): - print(f"Processing strand_conversion: {strand_conversion}") - - # Pivot the dataframe - pivoted_df = df[df.strand_conversion == strand_conversion].pivot( - index="CombinedPosition", - columns="barcode", - values="count" - ) - - # Replace NaN with 0 for missing values - pivoted_df = pivoted_df.fillna(0) - - # Convert to a sparse matrix - sparse_matrix = sp.csr_matrix(pivoted_df.values) - - # Create an AnnData object - adata = ad.AnnData( - X=sparse_matrix, - obs=pd.DataFrame(index=pivoted_df.index), # Row (site) metadata - var=pd.DataFrame(index=pivoted_df.columns) # Column (barcode) metadata - ) - - # Save the AnnData object - output_file = os.path.join( - final_output_dir, - f"comprehensive_{strand_conversion.replace('>', '_')}_edits_matrix.h5ad" - ) - adata.write(output_file) - print(f"Saved sparse matrix for {strand_conversion} to {output_file}") - - def generate_and_split_bed_files_for_all_edits(output_folder, bam_filepaths, tabulation_bed=None, processes=4, output_suffix="all_cells"): """ Generates combined BED files for all edit sites and splits them into suffix-specific files. @@ -438,9 +130,12 @@ def generate_and_split_bed_files_for_all_edits(output_folder, bam_filepaths, tab tabulation_bed_df['contig_position'] = tabulation_bed_df['chrom'].astype(str) + '_' + tabulation_bed_df['start'].astype(str) print(f"\t{len(tabulation_bed_df)} unique positions in {tabulation_bed}...") print(tabulation_bed_df.head()) + + valid_positions = set(df['contig_position']) + positions_to_keep = valid_positions.intersection(set(tabulation_bed_df.contig_position)) - df = df[df['contig_position'].isin(set(tabulation_bed_df.contig_position))] - print(f"\tRunning {len(df)} positions through all-cell coverage tabulation...") + print(f"\t{len(positions_to_keep)} out of {len(valid_positions)} specified positions in {tabulation_bed} are valid") + df = df[df['contig_position'].isin(positions_to_keep)] # Pivot edit dataframe print("Pivoting edits dataframe into sparse h5ad files...") @@ -465,8 +160,7 @@ def generate_and_split_bed_files_for_all_edits(output_folder, bam_filepaths, tab pool.map(process_combination_for_split, combinations) print(f"All split BED files generated in {output_folder}/combined_{output_suffix}_split_by_suffix") - - + def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strandedness=True, barcode_tag="CB", paired_end=False, barcode_whitelist_file=None, verbose=False, coverage_only=False, filtering_only=False, annotation_only=False, bedgraphs_list=[], sailor_list=[], min_base_quality = 15, min_read_quality = 0, min_dist_from_end = 10, max_edits_per_read = None, cores = 64, number_of_expected_bams=4, keep_intermediate_files=False, @@ -475,10 +169,6 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand all_cells_coverage=False, tabulation_bed=None ): - # Check to make sure the folder is empty, otherwise prompt for overwriting - if any(os.scandir(output_folder)): - pretty_print("WARNING: {} is not 
empty".format(output_folder), style="^") - logging_folder = "{}/metadata".format(output_folder) with open('{}/manifest.txt'.format(logging_folder), 'a+') as f: @@ -505,86 +195,33 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand final_path_already_exists = False final_annotated_path_already_exists = False + # Edit identification + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + pretty_print("Identifying edits", style="~") + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if os.path.exists(final_filtered_sites_path): print("{} exists... skipping edit finding.".format(final_filtered_sites_path)) final_path_already_exists = True - - # Edit finding - if not (coverage_only or filtering_only) and not final_path_already_exists: - overall_total_reads_processed = 0 - if barcode_whitelist_file: - barcode_whitelist = read_barcode_whitelist_file(barcode_whitelist_file) - else: - barcode_whitelist = None - - # Edit identification - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - pretty_print("Identifying edits", style="~") - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - if len(contigs) == 0: - # Take care of the case where no contigs are specified, so that all contigs available are processed - broken_up_contigs = [[]] - else: - if barcode_tag: - # For single cell sequencing we will only process this many contigs at a time - broken_up_contigs = get_broken_up_contigs(contigs, num_per_sublist) - - else: - # For bulk sequencing we will just process all contigs - broken_up_contigs = [contigs] - - print('Contig groups to be processed:', broken_up_contigs) - - overall_counts_summary_df = defaultdict(lambda:0) - overall_total_reads_processed = 0 - for subcontig_list in broken_up_contigs: - - overall_label_to_list_of_contents, results, total_seconds_for_reads_df, total_reads_processed, counts_summary_df = edit_finder( + else: + if not (coverage_only or filtering_only): + run_edit_finding( + barcode_tag, + barcode_whitelist_file, + contigs, + num_per_sublist, bam_filepath, output_folder, strandedness, - barcode_tag, - barcode_whitelist, - subcontig_list, - verbose, - cores=cores, - min_read_quality=min_read_quality, - min_base_quality=min_base_quality, - dist_from_end=min_dist_from_end, - interval_length=interval_length + min_read_quality, + min_base_quality, + min_dist_from_end, + interval_length, + number_of_expected_bams, + cores, + logging_folder, + verbose ) - - for k,v in counts_summary_df.items(): - overall_counts_summary_df[k] += v - - overall_total_reads_processed += total_reads_processed - #total_seconds_for_reads_df.to_csv("{}/edit_finder_timing.tsv".format(logging_folder), sep='\t') - - if barcode_tag: - # Make a subfolder into which the split bams will be placed - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - pretty_print("Contigs processed\n\n\t{}".format(sorted(list(overall_label_to_list_of_contents.keys())))) - pretty_print("Splitting and reconfiguring BAMs to optimize coverage calculations", style="~") - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - total_bam_generation_time, total_seconds_for_bams_df = bam_processing(overall_label_to_list_of_contents, output_folder, 
barcode_tag=barcode_tag, cores=cores, number_of_expected_bams=number_of_expected_bams, verbose=verbose) - #total_seconds_for_bams_df.to_csv("{}/bam_reconfiguration_timing.tsv".format(logging_folder), sep='\t') - pretty_print("Total time to concat and write bams: {} minutes".format(round(total_bam_generation_time/60, 3))) - - print("Deleting overall_label_to_list_of_contents...") - del overall_label_to_list_of_contents - - - with open('{}/manifest.txt'.format(logging_folder), 'a+') as f: - f.write(f'total_reads_processed\t{overall_total_reads_processed}\n') - for k, v in overall_counts_summary_df.items(): - f.write(f'{k}\t{v}\n') - - f.write(f'edits per read (EPR)\t{overall_counts_summary_df.get("total_edits")/overall_total_reads_processed}\n') - reconfigured_bam_filepaths = glob('{}/split_bams/*/*.bam'.format(output_folder)) if not final_path_already_exists and not skip_coverage: @@ -594,7 +231,7 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand # We want to run the samtools depth command for each of the reconfigured bam files print("Running samtools depth on {} subset bam paths...".format(len(reconfigured_bam_filepaths))) - total_time, total_seconds_for_contig_df = generate_depths(output_folder, reconfigured_bam_filepaths, paired_end=paired_end, barcode_tag=barcode_tag) + total_time, total_seconds_for_contig_df = generate_depths(output_folder, reconfigured_bam_filepaths, paired_end=paired_end, barcode_tag=barcode_tag, cores=cores) total_seconds_for_contig_df.to_csv("{}/coverage_calculation_timing.tsv".format(logging_folder), sep='\t') @@ -651,6 +288,8 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand # Annotation option if final_path_already_exists and annotation_bedfile_path: + pretty_print("Annotating edits...", style="~") + final_site_level_information_df = pd.read_csv('{}/final_filtered_site_info.tsv'.format(output_folder), sep='\t') final_site_level_information_annotated_df = annotate_sites(final_site_level_information_df, @@ -685,8 +324,11 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand print(f'Time elapsed: {time.time()-start_time:.2f}s') if final_path_already_exists and all_cells_coverage: + # Coverage across all cells + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + pretty_print("Calculating coverage across all cells at all positions...", style="~") + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ output_suffix = "all_cells" - print("Calculating coverage at all edit sites in all cells...") # Get the list of BAM file paths bam_filepaths = glob(f"{output_folder}/split_bams/*/*.bam") @@ -802,51 +444,26 @@ def check_samtools(): interval_length = args.interval_length num_per_sublist = args.num_per_sublist + print_marine_logo() + check_folder_is_empty_warn_if_not(output_folder) # all_cells_coverage only applies for single cell case if not barcode_tag: if all_cells_coverage == True: + print("WARNING: --all_cells_coverage flag only applies for single cell case, ignoring...") all_cells_coverage = False - if all_cells_coverage: - print("\n\nWill tabulate coverage across all cells... WARNING this can be extremely resource-consuming if there are a lot of cells and a lot of sites. 
Consider first filtering sites and then using the --tabulation_bed argument to specify the specific locations you would like tabulated across all cells.\n\n") - if tabulation_bed: - if os.path.exists(tabulation_bed): - print("\t...using sites in {}".format(tabulation_bed)) - else: - print("{} does not exist! Exiting.".format(tabulation_bed)) - sys.exit(1) - - # Convert bedgraphs argument into list of conversions - if not bedgraphs is None: - if barcode_tag in ['CB', 'IB']: - sys.stderr.write("Can only output bedgraphs for bulk sequencing runs of MARINE") - sys.exit(1) - - bedgraphs_list = bedgraphs.upper().replace('I', 'G').split(',') - for b in bedgraphs_list: - assert(b in ['AC', 'AG', 'AT', 'CA', 'CG', 'CT', 'GA', 'GC', 'GT', 'TA', 'TC', 'TG']) - else: - bedgraphs_list = [] + print_all_cells_coverage_warning(all_cells_coverage, tabulation_bed) - if not sailor is None: - if barcode_tag in ['CB', 'IB']: - sys.stderr.write("Can only output sailor for bulk sequencing runs of MARINE") - sys.exit(1) - - sailor_list = sailor.upper().replace('I', 'G').split(',') - for s in sailor_list: - assert(s in ['AC', 'AG', 'AT', 'CA', 'CG', 'CT', 'GA', 'GC', 'GT', 'TA', 'TC', 'TG']) - else: - sailor_list = [] - + bedgraphs_list = convert_conversions_argument(bedgraphs, barcode_tag, file_type='bedgraph') + sailor_list = convert_conversions_argument(bedgraphs, barcode_tag, file_type='sailor') + assert(strandedness in [0, 1, 2]) if not os.path.exists(output_folder): pretty_print("{} (output folder) does not exist, making folder...".format(output_folder)) os.mkdir(output_folder) - # Get the exact command line used to run this script command_line = " ".join(shlex.quote(arg) for arg in sys.argv) print('command: {}'.format(command_line)) @@ -866,8 +483,6 @@ def check_samtools(): assert(not(coverage_only and filtering_only)) - print_marine_logo() - pretty_print(["Arguments:", "\tBAM filepath:\t{}".format(bam_filepath), "\tAnnotation bedfile filepath:\t{}".format(annotation_bedfile_path), diff --git a/src/core.py b/src/core.py index 8b1afd2..579cd32 100644 --- a/src/core.py +++ b/src/core.py @@ -18,11 +18,157 @@ print_read_info, get_read_information, get_hamming_distance, remove_softclipped_bases,find from utils import get_contig_lengths_dict, get_intervals, index_bam, write_rows_to_info_file, write_header_to_edit_info, \ -write_read_to_bam_file, remove_file_if_exists, make_folder, concat_and_write_bams_wrapper, make_edit_finding_jobs, pretty_print,\ -get_coverage_wrapper, write_reads_to_file, sort_bam, rm_bam, suffixes +write_read_to_bam_file, remove_file_if_exists, make_folder, concat_and_write_bams_wrapper, make_edit_finding_jobs, pretty_print, get_contigs_that_need_bams_written, split_bed_file, \ +get_coverage_wrapper, write_reads_to_file, sort_bam, rm_bam, suffixes, get_broken_up_contigs, run_command, \ +make_depth_command_script_single_cell, concatenate_files, generate_and_run_bash_merge import os, psutil + +def generate_depths(output_folder, bam_filepaths, paired_end=False, barcode_tag=None, cores=1): + + coverage_start_time = time.perf_counter() + + all_depth_commands = [] + + combine_edit_sites_command = ( + "echo 'concatenating bed file...';" + "for file in {}/edit_info/*edit_info.tsv; do " + "awk 'NR > 1 {{print $2, $4-1, $4}}' OFS='\t' \"$file\"; " + "done | sort -k1,1 -k2,2n -u > {}/combined_source_cells.bed;" + ).format(output_folder, output_folder) + + if not os.path.exists(f'{output_folder}/combined_source_cells.bed'): + run_command(combine_edit_sites_command) + + 
all_depth_commands.append(combine_edit_sites_command) + + output_suffix = 'source_cells' + + if barcode_tag: + coverage_subfolder = '{}/coverage'.format(output_folder) + make_folder(coverage_subfolder) + + # Single cell mode + split_bed_file( + f"{output_folder}/combined_{output_suffix}.bed", + f"{output_folder}/combined_{output_suffix}_split_by_suffix", + bam_filepaths, + output_suffix=output_suffix + ) + + make_depth_command_script_single_cell(paired_end, bam_filepaths, output_folder, + all_depth_commands=all_depth_commands, output_suffix='source_cells', run=True, processes=cores, barcode_tag=barcode_tag) + + else: + if paired_end: + paired_end_flag = '-s ' + else: + paired_end_flag = '' + + # Bulk mode, we will not split the bed and simply use samtools depth on the combined.bed + samtools_depth_command = f"samtools depth {paired_end_flag}-a -b {output_folder}/combined_source_cells.bed {bam_filepath} > {output_folder}/depths_source_cells.txt" + run_command(samtools_depth_command) + + + print("Concatenating edit info files...") + concatenate_files(output_folder, "edit_info/*edit_info.tsv", + "{}/final_edit_info_no_coverage.tsv".format(output_folder), + run=True) + + print("Append the depth columns to the concatenated final_edit_info file...") + + header_columns = ['barcode', 'contig', 'position', 'ref', 'alt', + 'read_id', 'strand', 'coverage'] + + + generate_and_run_bash_merge(output_folder, + '{}/final_edit_info_no_coverage.tsv'.format(output_folder), + '{}/depths_source_cells.txt'.format(output_folder), + '{}/final_edit_info.tsv'.format(output_folder), + header_columns, barcode_tag=barcode_tag) + + coverage_total_time = time.perf_counter() - coverage_start_time + + total_seconds_for_contig_df = pd.DataFrame({'coverage_total_time': [coverage_total_time]}) + return coverage_total_time, total_seconds_for_contig_df + + +def bam_processing(bam_filepath, overall_label_to_list_of_contents, output_folder, barcode_tag='CB', cores=1, number_of_expected_bams=4, + verbose=False): + # Only used for single-cell and/or long read reconfiguration of bams to optimize coverage calculation + split_bams_folder = '{}/split_bams'.format(output_folder) + make_folder(split_bams_folder) + contigs_to_generate_bams_for = get_contigs_that_need_bams_written(list(overall_label_to_list_of_contents.keys()), + split_bams_folder, + barcode_tag=barcode_tag, + number_of_expected_bams=number_of_expected_bams + ) + if verbose: + pretty_print("Will split and reconfigure the following contigs: {}".format(",".join(contigs_to_generate_bams_for))) + + + # BAM Generation + total_bam_generation_time, total_seconds_for_bams = run_bam_reconfiguration( + split_bams_folder, + bam_filepath, + overall_label_to_list_of_contents, + contigs_to_generate_bams_for, + barcode_tag=barcode_tag, + cores=cores, + number_of_expected_bams=number_of_expected_bams, + verbose=verbose) + + total_seconds_for_bams_df = pd.DataFrame.from_dict(total_seconds_for_bams, orient='index') + total_seconds_for_bams_df.columns = ['seconds'] + total_seconds_for_bams_df['contigs'] = total_seconds_for_bams_df.index + total_seconds_for_bams_df.index = range(len(total_seconds_for_bams_df)) + + return total_bam_generation_time, total_seconds_for_bams_df + + +def edit_finder(bam_filepath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], + verbose=False, cores=64, min_read_quality=0, min_base_quality=0, dist_from_end=0, interval_length=2000000): + + pretty_print("Each contig is being split into subsets of length...".format(interval_length)) 
+ + overall_label_to_list_of_contents, results, overall_time, overall_total_reads, \ + total_seconds_for_reads, counts_summary_df = run_edit_identifier( + bam_filepath, + output_folder, + strandedness=strandedness, + barcode_tag=barcode_tag, + barcode_whitelist=barcode_whitelist, + contigs=contigs, + verbose=verbose, + cores=cores, + min_read_quality=min_read_quality, + min_base_quality=min_base_quality, + dist_from_end=dist_from_end, + interval_length=interval_length + ) + + #print(overall_label_to_list_of_contents.keys()) + #print(overall_label_to_list_of_contents.get(list(overall_label_to_list_of_contents.keys())[0])) + + pretty_print( + [ + "Reads processed:\t{}".format(overall_total_reads), + "Time to process reads in min:\t{}".format(round(overall_time/60, 5)), + "Read Summary:\n{}".format(counts_summary_df) + ], + style="-" + ) + + + total_seconds_for_reads_df = pd.DataFrame.from_dict(total_seconds_for_reads, orient='index') + total_seconds_for_reads_df.columns = ['seconds'] + total_seconds_for_reads_df['reads'] = total_seconds_for_reads_df.index + total_seconds_for_reads_df.index = range(len(total_seconds_for_reads_df)) + + + return overall_label_to_list_of_contents, results, total_seconds_for_reads_df, overall_total_reads, counts_summary_df + def run_edit_identifier(bampath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], verbose=False, cores=64, min_read_quality=0, min_base_quality=0, dist_from_end=0, interval_length=2000000): # Make subfolder in which to information about edits @@ -110,6 +256,92 @@ def run_bam_reconfiguration(split_bams_folder, bampath, overall_label_to_list_of return total_bam_generation_time, total_seconds_for_bams +def run_edit_finding(barcode_tag, + barcode_whitelist_file, + contigs, + num_per_sublist, + bam_filepath, + output_folder, + strandedness, + min_read_quality, + min_base_quality, + min_dist_from_end, + interval_length, + number_of_expected_bams, + cores, + logging_folder, + verbose=False + ): + overall_total_reads_processed = 0 + if barcode_whitelist_file: + barcode_whitelist = read_barcode_whitelist_file(barcode_whitelist_file) + else: + barcode_whitelist = None + + if len(contigs) == 0: + # Take care of the case where no contigs are specified, so that all contigs available are processed + broken_up_contigs = [[]] + else: + if barcode_tag: + # For single cell sequencing we will only process this many contigs at a time + broken_up_contigs = get_broken_up_contigs(contigs, num_per_sublist) + + else: + # For bulk sequencing we will just process all contigs + broken_up_contigs = [contigs] + + print('Contig groups to be processed:', broken_up_contigs) + + overall_counts_summary_df = defaultdict(lambda:0) + overall_total_reads_processed = 0 + for subcontig_list in broken_up_contigs: + + overall_label_to_list_of_contents, results, total_seconds_for_reads_df, total_reads_processed, counts_summary_df = edit_finder( + bam_filepath, + output_folder, + strandedness, + barcode_tag, + barcode_whitelist, + subcontig_list, + verbose, + cores=cores, + min_read_quality=min_read_quality, + min_base_quality=min_base_quality, + dist_from_end=min_dist_from_end, + interval_length=interval_length + ) + + for k,v in counts_summary_df.items(): + overall_counts_summary_df[k] += v + + overall_total_reads_processed += total_reads_processed + + #total_seconds_for_reads_df.to_csv("{}/edit_finder_timing.tsv".format(logging_folder), sep='\t') + + if barcode_tag: + # Make a subfolder into which the split bams will be placed + # 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + pretty_print("Contigs processed\n\n\t{}".format(sorted(list(overall_label_to_list_of_contents.keys())))) + pretty_print("Splitting and reconfiguring BAMs to optimize coverage calculations", style="~") + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + + total_bam_generation_time, total_seconds_for_bams_df = bam_processing(bam_filepath, overall_label_to_list_of_contents, output_folder, barcode_tag=barcode_tag, cores=cores, number_of_expected_bams=number_of_expected_bams, verbose=verbose) + #total_seconds_for_bams_df.to_csv("{}/bam_reconfiguration_timing.tsv".format(logging_folder), sep='\t') + pretty_print("Total time to concat and write bams: {} minutes".format(round(total_bam_generation_time/60, 3))) + + print("Deleting overall_label_to_list_of_contents...") + del overall_label_to_list_of_contents + + + with open('{}/manifest.txt'.format(logging_folder), 'a+') as f: + f.write(f'total_reads_processed\t{overall_total_reads_processed}\n') + for k, v in overall_counts_summary_df.items(): + f.write(f'{k}\t{v}\n') + + f.write(f'edits per read (EPR)\t{overall_counts_summary_df.get("total_edits")/overall_total_reads_processed}\n') + + def incorporate_barcode(read_as_string, contig, barcode): read_tab_separated = read_as_string.split('\t') contig_section = '{}_{}'.format(contig, barcode) diff --git a/src/utils.py b/src/utils.py index fdb9cd1..76c9f83 100644 --- a/src/utils.py +++ b/src/utils.py @@ -14,9 +14,29 @@ from multiprocessing import Pool import multiprocessing import time +import scipy.sparse as sp +import anndata as ad +# Number of barcode characters to use as suffix during splitting CB_N = 1 +def print_marine_logo(): + logo_lines = [ + ":::: :::: ::: ::::::::: ::::::::::: :::: ::: :::::::::: ", + "+:+:+: :+:+:+ :+: :+: :+: :+: :+: :+:+: :+: :+: ", + "+:+ +:+:+ +:+ +:+ +:+ +:+ +:+ +:+ :+:+:+ +:+ +:+ ", + "+#+ +:+ +#+ +#++:++#++: +#++:++#: +#+ +#+ +:+ +#+ +#++:++# ", + "+#+ +#+ +#+ +#+ +#+ +#+ +#+ +#+ +#+#+# +#+ ", + "#+# #+# #+# #+# #+# #+# #+# #+# #+#+# #+# ", + "### ### ### ### ### ### ########### ### #### ########## " + ] + for l in logo_lines: + pretty_print(l) + + pretty_print("Multi-core Algorithm for Rapid Identification of Nucleotide Edits", style="=") + + + def generate_permutations_list_for_CB(n): """ Generate all permutations of A, C, G, T for strings of length n @@ -95,6 +115,94 @@ def generate_permutations_list_for_CB(n): ] } + +def generate_bedgraphs(final_site_level_information_df, conversion_search, output_folder): + bedgraph_folder = '{}/bedgraphs'.format(output_folder) + make_folder(bedgraph_folder) + + pretty_print("Making bedgraphs for {} conversions...\n".format(bedgraphs_list)) + for conversion in bedgraphs_list: + conversion_search = conversion[0] + '>' + conversion[1] + sites_for_conversion = final_site_level_information_df[final_site_level_information_df.conversion == conversion_search] + sites_for_conversion['edit_fraction'] = sites_for_conversion['count']/sites_for_conversion['coverage'] + sites_for_conversion['start'] = sites_for_conversion['position'] - 1 + sites_for_conversion_bedgraph_cols = sites_for_conversion[['contig', 'start', 'position', 'edit_fraction']] + + sites_for_conversion_bedgraph_cols.to_csv('{}/{}_{}.bedgraph'.format(bedgraph_folder, output_folder.split('/')[-1], conversion), sep='\t', index=False, header=False) + + + +def 
convert_sites_to_sailor(final_site_level_information_df, sailor_list, output_folder, skip_coverage): + # Output SAILOR-formatted file for use in FLARE downstream + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # 1 629275 629276 0.966040688 2,30 + + # 1 629309 629310 2.8306e-05 1,1043 + + + for conversion in sailor_list: + conversion_search = conversion[0] + '>' + conversion[1] + + print("Generating SAILOR-style bed outputs for conversion {}...".format(conversion)) + + sailor_sites,weird_sites = get_sailor_sites(final_site_level_information_df, conversion_search, skip_coverage=skip_coverage) + sailor_sites = sailor_sites.drop_duplicates() + + print("{} final deduplicated {} SAILOR-formatted sites".format(len(sailor_sites), conversion_search)) + sailor_sites.to_csv('{}/sailor_style_sites_{}.bed'.format( + output_folder, + conversion_search.replace(">", "-")), + header=False, + index=False, + sep='\t') + + + +def split_bed_file(input_bed_file, output_folder, bam_filepaths, output_suffix=''): + """ + Split a BED file into multiple files based on suffixes in the first column. + Each line is assigned to the appropriate file based on the suffix. + + e.g.: + + 10_AAACGAAAGTCACACT-1 6143263 6143264 + 10_AAACGAAAGTCACACT-1 11912575 11912576 + 10_AAACGAAAGTCACACT-1 12209751 12209752 + 10_AAACGAAAGTCACACT-1 13320235 13320236 + 10_AAACGAAAGTCACACT-1 27036085 27036086 + + """ + if not os.path.exists(output_folder): + os.makedirs(output_folder) + + single_cell_approach = len(bam_filepaths) > 0 + + suffix_pairs = [ + (os.path.basename(bam).split("_")[0], + os.path.basename(bam).split("_")[1].split(".")[0]) for bam in bam_filepaths + ] + + # Open file handles for each suffix + file_handles = {} + for prefix, suffix in suffix_pairs: + output_file = os.path.join(output_folder, f"combined_{output_suffix}_{prefix}_{suffix}.bed") + file_handles[prefix + suffix] = open(output_file, 'w') + + try: + with open(input_bed_file, 'r') as infile: + for line in infile: + # Parse the first column to determine the suffix + columns = line.split() + + chrom = columns[0] # Assuming the first column is the chromosome + for prefix, suffix in suffix_pairs: + if chrom.startswith(f"{prefix}_") and chrom.endswith(suffix): + file_handles[prefix + suffix].write(line) + break + + finally: + # Close all file handles + for handle in file_handles.values(): + handle.close() + def get_contigs_that_need_bams_written(expected_contigs, split_bams_folder, barcode_tag='CB', number_of_expected_bams=4): bam_indices_written = [f.split('/')[-1].split('.bam')[0] for f in glob('{}/*/*.sorted.bam.bai'.format(split_bams_folder))] @@ -118,6 +226,68 @@ def get_contigs_that_need_bams_written(expected_contigs, split_bams_folder, barc return contigs_to_write_bams_for +def get_broken_up_contigs(contigs, num_per_sublist): + broken_up_contigs = [] + + i_options = range((math.ceil(len(contigs)/num_per_sublist)) + 1) + + for i in i_options: + contig_sublist = [] + j_options = range(i*num_per_sublist, (i*num_per_sublist) + num_per_sublist) + + for j in j_options: + if j < len(contigs): + contig_sublist.append(contigs[j]) + + if len(contig_sublist) > 0: + broken_up_contigs.append(contig_sublist) + return broken_up_contigs + + +def pivot_edits_to_sparse(df, output_folder): + + # Create a new column for contig:position + df["CombinedPosition"] = df["contig"].astype(str) + ":" + df["position"].astype(str) + + # Ensure the output directory exists + final_output_dir = os.path.join(output_folder, "final_matrix_outputs") + os.makedirs(final_output_dir, 
exist_ok=True) + + print(f"Saving edit sparse matrices to {final_output_dir}") + + for strand_conversion in df.strand_conversion.unique(): + print(f"\tProcessing strand_conversion: {strand_conversion}") + + # Pivot the dataframe + pivoted_df = df[df.strand_conversion == strand_conversion].pivot( + index="CombinedPosition", + columns="barcode", + values="count" + ) + + # Replace NaN with 0 for missing values + pivoted_df = pivoted_df.fillna(0) + + # Convert to a sparse matrix + sparse_matrix = sp.csr_matrix(pivoted_df.values) + + # Create an AnnData object + adata = ad.AnnData( + X=sparse_matrix, + obs=pd.DataFrame(index=pivoted_df.index), # Row (site) metadata + var=pd.DataFrame(index=pivoted_df.columns) # Column (barcode) metadata + ) + + # Save the AnnData object + output_file_name = f"comprehensive_{strand_conversion.replace('>', '_')}_edits_matrix.h5ad" + output_file = os.path.join( + final_output_dir, + output_file_name + ) + adata.write(output_file) + print(f"\t\tSaved sparse matrix for {strand_conversion} to {output_file_name}") + + def make_edit_finding_jobs(bampath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], verbose=False, min_read_quality=0, min_base_quality=0, dist_from_end=0, interval_length=2000000): jobs = [] @@ -182,7 +352,30 @@ def read_barcode_whitelist_file(barcode_whitelist_file): pretty_print("Barcodes in whitelist: {}".format(len(barcode_whitelist))) return barcode_whitelist +def print_all_cells_coverage_warning(all_cells_coverage, tabulation_bed): + if all_cells_coverage: + print("\n\nWill tabulate coverage across all cells... WARNING this can be extremely resource-consuming if there are a lot of cells and a lot of sites. Consider first filtering sites and then using the --tabulation_bed argument to specify the specific locations you would like tabulated across all cells.\n\n") + if tabulation_bed: + if os.path.exists(tabulation_bed): + print("\t...using sites in {}".format(tabulation_bed)) + else: + print("{} does not exist! Exiting.".format(tabulation_bed)) + sys.exit(1) +def convert_conversions_argument(conversions, barcode_tag, file_type=None): + # Convert bedgraphs argument into list of conversions + if not conversions is None: + if barcode_tag in ['CB', 'IB']: + sys.stderr.write(f"Can only output {file_type} for bulk sequencing runs of MARINE") + sys.exit(1) + + conversions_list = conversions.upper().replace('I', 'G').split(',') + for b in conversions_list: + assert(b in ['AC', 'AG', 'AT', 'CA', 'CG', 'CT', 'GA', 'GC', 'GT', 'TA', 'TC', 'TG']) + else: + conversions_list = [] + return conversions_list + def pretty_print(contents, style=''): if type(contents) == list: for item in contents: @@ -195,7 +388,7 @@ def pretty_print(contents, style=''): before_line = None after_line = None - styled_line = ''.join([style for i in range(len(to_write))]) + styled_line = ''.join([style for i in range(min(100, len(to_write)))]) if style != '': # Line before @@ -754,7 +947,7 @@ def merge_files_by_chromosome(args): # Use bash to execute the paste command run_command(f"bash -c '{paste_command}'") - print(f"Columnar merge complete for {chromosome}. Output saved to {merged_file}.") + print(f"\tColumnar merge complete for {chromosome}. Output saved to {merged_file}.") def prepare_matrix_files_multiprocess(output_matrix_folder, @@ -763,7 +956,7 @@ def prepare_matrix_files_multiprocess(output_matrix_folder, """ Merges matrix files column-wise, grouping by chromosome, using multiprocessing. 
""" - print("Merging matrices column-wise by chromosome...") + print("\n\nMerging matrices column-wise by chromosome...") # Group files by chromosome matrix_files = [ @@ -789,7 +982,7 @@ def prepare_matrix_files_multiprocess(output_matrix_folder, with Pool(processes=processes) as pool: pool.map(merge_files_by_chromosome, task_args) - print("All columnar merges complete.") + print("All columnar merges complete.\n") @@ -813,7 +1006,7 @@ def calculate_coverage(bam_filepath, bed_filepath, output_filepath, output_matri depths_file = output_filepath - print(f"Running samtools view on {bed_filepath} for {bam_filepath}, outputting to {output_filepath}") + print(f"\tRunning samtools view on {bed_filepath} for {bam_filepath}, outputting to {output_filepath}\n") regions = [] with open(bed_filepath, "r") as bed: @@ -823,7 +1016,7 @@ def calculate_coverage(bam_filepath, bed_filepath, output_filepath, output_matri regions.append((chrom, start, end)) if len(regions) > 0: - print(f"\t{len(regions)} regions") + print(f"\t{bed_filepath.split('/')[-1]}: {len(regions)} regions") with pysam.AlignmentFile(bam_filepath, "rb") as bam, open(depths_file, "w") as out: try: @@ -882,8 +1075,6 @@ def prepare_pysam_coverage_args(bam_filepaths, output_folder, output_suffix='', if len(output_suffix) > 0: output_suffix = f"_{output_suffix}" - print("prepare_pysam_coverage_args, barcode_tag is {}".format(barcode_tag)) - for bam_filepath in bam_filepaths: # Extract suffix from BAM filename bam_filename = os.path.basename(bam_filepath) @@ -909,7 +1100,17 @@ def prepare_pysam_coverage_args(bam_filepaths, output_folder, output_suffix='', return args_list - +def check_folder_is_empty_warn_if_not(output_folder): + # Check to make sure the folder is empty, otherwise prompt for overwriting + if any(os.scandir(output_folder)): + file_info = [] + for i in os.scandir(output_folder): + file_info.append('\tFile: {}'.format(i)) + + pretty_print("WARNING: {} is not empty\n{}".format(output_folder, + '\n'.join(file_info) + ), style="^") + def make_depth_command_script_single_cell(paired_end, bam_filepaths, output_folder, all_depth_commands=[], output_suffix='', run=False, pivot=False, processes=4, barcode_tag=None): """ @@ -930,7 +1131,7 @@ def make_depth_command_script_single_cell(paired_end, bam_filepaths, output_fold f.write('{}\n\n'.format(d)) if run: - print("Calculating depths using multiprocessing with pysam...") + pretty_print("\nCalculating depths using multiprocessing with pysam...", style='.') run_pysam_count_coverage(pysam_coverage_args, processes) merge_depth_files(output_folder, output_suffix) diff --git a/tests/integration_tests.ipynb b/tests/integration_tests.ipynb index dcaa5f1..1e86212 100644 --- a/tests/integration_tests.ipynb +++ b/tests/integration_tests.ipynb @@ -2889,7 +2889,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "f5c3ed3e-13dd-4399-924e-3d1ac17ce387", "metadata": {}, "outputs": [ @@ -2998,23 +2998,25 @@ "\n", "\n", "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n", - "Checking results for long_read_sc_test\n" - ] - }, - { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: 'singlecell_tests/long_read_sc_test/final_filtered_site_info_annotated.tsv'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[4], line 267\u001b[0m\n\u001b[1;32m 
265\u001b[0m folder \u001b[38;5;241m=\u001b[39m info\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfolder\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 266\u001b[0m final_filtered_site_info_annotated \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m/final_filtered_site_info_annotated.tsv\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(folder, test_name)\n\u001b[0;32m--> 267\u001b[0m final_filtered_site_info_annotated_df \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfinal_filtered_site_info_annotated\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msep\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;130;43;01m\\t\u001b[39;49;00m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex_col\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 269\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtotal_edit_sites\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m info:\n\u001b[1;32m 270\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n", - "File \u001b[0;32m~/miniconda3/envs/marine_environment/lib/python3.8/site-packages/pandas/io/parsers/readers.py:912\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 899\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 900\u001b[0m dialect,\n\u001b[1;32m 901\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 908\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m 909\u001b[0m )\n\u001b[1;32m 910\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m--> 912\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/miniconda3/envs/marine_environment/lib/python3.8/site-packages/pandas/io/parsers/readers.py:577\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 574\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 576\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 577\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 579\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m 580\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n", - "File \u001b[0;32m~/miniconda3/envs/marine_environment/lib/python3.8/site-packages/pandas/io/parsers/readers.py:1407\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1404\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 1406\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1407\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/miniconda3/envs/marine_environment/lib/python3.8/site-packages/pandas/io/parsers/readers.py:1661\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1659\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1660\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1661\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1662\u001b[0m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1663\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1664\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1665\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1666\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1667\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1668\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1669\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1670\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1671\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1672\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n", - "File \u001b[0;32m~/miniconda3/envs/marine_environment/lib/python3.8/site-packages/pandas/io/common.py:859\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 854\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 855\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m 856\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m 857\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m 858\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 859\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 860\u001b[0m \u001b[43m \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 861\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 862\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 863\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 864\u001b[0m \u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 865\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 866\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 867\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m 868\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n", - "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'singlecell_tests/long_read_sc_test/final_filtered_site_info_annotated.tsv'" + "Checking results for long_read_sc_test\n", + "\tExpecting: {'contig': '6', 'barcode': 'ENSMUST00000203816-AACGTGTTGGAGAGGG-16-G', 'position': 115807969, 'num_rows': 1, 'count': 1, 'coverage': 1, 'conversion': 'A>C', 'strand_conversion': 'T>G', 'strand': '-', 'feature_name': 'Rpl32'}\n", + "\n", + "\t >>> long_read_sc_test passed! <<<\n", + "\n", + "\tExpecting: {'contig': '6', 'barcode': 'ENSMUST00000081840-AAGTCGTACCAGGCTC-40-C', 'position': 115805653, 'num_rows': 1, 'count': 1, 'coverage': 1, 'conversion': 'G>A', 'strand_conversion': 'C>T', 'strand': '-', 'feature_name': 'Rpl32'}\n", + "\n", + "\t >>> long_read_sc_test passed! <<<\n", + "\n", + "\tExpecting: {'contig': '6', 'barcode': 'ENSMUST00000081840-AACGTGTTGGAGAGGG-40-G', 'position': 115807015, 'num_rows': 1, 'count': 1, 'coverage': 8, 'conversion': 'C>T', 'strand_conversion': 'G>A', 'strand': '-', 'feature_name': 'Rpl32'}\n", + "\n", + "\t >>> long_read_sc_test passed! <<<\n", + "\n", + "Checking that analyzing a single-cell dataset in 'bulk' mode (i.e. not specificying the 'CB' barcode) yields the exact same positions and base changes, but with counts and coverages aggregated rather than at a single-cell resolution\n", + "grouped_sc_rows: 62, bulk_rows: 62\n", + "\n", + "\t >>> single-cell and bulk on same dataset comparison passed! <<<\n", + "\n", + "There were 0 failures\n" ] } ],
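For context, a hedged usage sketch for the sparse outputs written by pivot_edits_to_sparse() in this patch: each comprehensive_<conversion>_edits_matrix.h5ad file under final_matrix_outputs/ can be read back with anndata. The "marine_output" folder name below is a placeholder, not a path produced by the patch.

import anndata as ad

# Folder and file naming follow pivot_edits_to_sparse():
# <output_folder>/final_matrix_outputs/comprehensive_<conversion>_edits_matrix.h5ad
adata = ad.read_h5ad("marine_output/final_matrix_outputs/comprehensive_C_T_edits_matrix.h5ad")

print(adata.shape)           # (n_sites, n_barcodes)
print(adata.obs_names[:5])   # CombinedPosition strings, e.g. "6:115807015"
print(adata.var_names[:5])   # cell barcodes
edit_counts_per_site = adata.X.sum(axis=1)  # total edited reads per site across all cells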