From e6fa5a11c16480f42a63a5f3829da9c74c25ffd9 Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Thu, 7 Nov 2024 12:30:29 -0800 Subject: [PATCH] make all intervals uniform size --- marine.py | 33 +++++++++++++++++---------------- src/core.py | 4 ++-- src/utils.py | 17 ++++++++++++----- 3 files changed, 31 insertions(+), 23 deletions(-) diff --git a/marine.py b/marine.py index 3dfc620..ab850b0 100755 --- a/marine.py +++ b/marine.py @@ -95,10 +95,10 @@ def delete_intermediate_files(output_folder): shutil.rmtree(object_path) -def edit_finder(bam_filepath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], num_intervals_per_contig=16, - verbose=False, cores=64, min_read_quality=0, min_base_quality=0, dist_from_end=0): +def edit_finder(bam_filepath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], + verbose=False, cores=64, min_read_quality=0, min_base_quality=0, dist_from_end=0, interval_length=2000000): - pretty_print("Each contig is being split into {} subsets...".format(num_intervals_per_contig)) + pretty_print("Each contig is being split into subsets of length...".format(interval_length)) overall_label_to_list_of_contents, results, overall_time, overall_total_reads, \ total_seconds_for_reads, counts_summary_df = run_edit_identifier( @@ -108,12 +108,12 @@ def edit_finder(bam_filepath, output_folder, strandedness, barcode_tag="CB", bar barcode_tag=barcode_tag, barcode_whitelist=barcode_whitelist, contigs=contigs, - num_intervals_per_contig=num_intervals_per_contig, verbose=verbose, cores=cores, min_read_quality=min_read_quality, min_base_quality=min_base_quality, - dist_from_end=dist_from_end + dist_from_end=dist_from_end, + interval_length=interval_length ) #print(overall_label_to_list_of_contents.keys()) @@ -139,6 +139,7 @@ def edit_finder(bam_filepath, output_folder, strandedness, barcode_tag="CB", bar def bam_processing(overall_label_to_list_of_contents, output_folder, barcode_tag='CB', cores=1, number_of_expected_bams=4, verbose=False): + # Only used for single-cell and/or long read reconfiguration of bams to optimize coverage calculation split_bams_folder = '{}/split_bams'.format(output_folder) make_folder(split_bams_folder) contigs_to_generate_bams_for = get_contigs_that_need_bams_written(list(overall_label_to_list_of_contents.keys()), @@ -424,10 +425,10 @@ def get_edits_with_coverage_df(output_folder, -def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_intervals_per_contig=16, strandedness=True, barcode_tag="CB", paired_end=False, barcode_whitelist_file=None, verbose=False, coverage_only=False, filtering_only=False, annotation_only=False, bedgraphs_list=[], sailor_list=[], min_base_quality = 15, min_read_quality = 0, min_dist_from_end = 10, max_edits_per_read = None, cores = 64, number_of_expected_bams=4, +def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strandedness=True, barcode_tag="CB", paired_end=False, barcode_whitelist_file=None, verbose=False, coverage_only=False, filtering_only=False, annotation_only=False, bedgraphs_list=[], sailor_list=[], min_base_quality = 15, min_read_quality = 0, min_dist_from_end = 10, max_edits_per_read = None, cores = 64, number_of_expected_bams=4, keep_intermediate_files=False, num_per_sublist=6, - skip_coverage=False): + skip_coverage=False, interval_length=2000000): print_marine_logo() @@ -450,7 +451,7 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_in f.write('barcode_tag\t{}\n'.format(barcode_tag)) f.write('barcode_whitelist_file\t{}\n'.format(barcode_whitelist_file)) f.write('contigs\t{}\n'.format(contigs)) - f.write('num_intervals_per_contig\t{}\n'.format(num_intervals_per_contig)) + f.write('interval_length\t{}\n'.format(interval_length)) f.write('verbose\t{}\n'.format(verbose)) f.write('cores\t{}\n'.format(cores)) f.write('number_of_expected_bams\t{}\n'.format(number_of_expected_bams)) @@ -496,12 +497,12 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_in barcode_tag, barcode_whitelist, subcontig_list, - num_intervals_per_contig, verbose, cores=cores, min_read_quality=min_read_quality, min_base_quality=min_base_quality, - dist_from_end=min_dist_from_end + dist_from_end=min_dist_from_end, + interval_length=interval_length ) for k,v in counts_summary_df.items(): @@ -749,7 +750,8 @@ def check_samtools(): parser.add_argument('--paired_end', dest='paired_end', action='store_true', help='Assess coverage taking without double-counting paired end overlapping regions... slower but more accurate. Edits by default are only counted once for an entire pair, whether they show up on both ends or not.') parser.add_argument('--skip_coverage', dest='skip_coverage', action='store_true') parser.add_argument('--max_edits_per_read', type=int, default=None) - parser.add_argument('--num_intervals_per_contig', type=int, default=200, help='Intervals to split analysis into... more intervals can yield faster perforamance especially with multiple cores') + parser.add_argument('--num_intervals_per_contig', type=int, default=200, help='Deprecated') + parser.add_argument('--interval_length', type=int, default=2000000, help='Length of intervals split analysis into...') args = parser.parse_args() bam_filepath = args.bam_filepath @@ -779,7 +781,7 @@ def check_samtools(): max_edits_per_read = args.max_edits_per_read num_intervals_per_contig = args.num_intervals_per_contig - + interval_length = args.interval_length num_per_sublist = args.num_per_sublist @@ -850,7 +852,7 @@ def check_samtools(): "\tMinimum distance from end:\t{}".format(min_dist_from_end), "\tMaximum edits per read:\t{}".format(max_edits_per_read), "\tContigs:\t{}".format(contigs), - "\tNumber of intervals:\t{}".format(num_intervals_per_contig), + "\tInterval length:\t{}".format(interval_length), "\tCores:\t{}".format(cores), "\tVerbose:\t{}".format(verbose), "\tKeep intermediate files:\t{}".format(keep_intermediate_files), @@ -879,7 +881,6 @@ def check_samtools(): barcode_tag=barcode_tag, paired_end=paired_end, barcode_whitelist_file=barcode_whitelist_file, - num_intervals_per_contig=num_intervals_per_contig, coverage_only=coverage_only, filtering_only=filtering_only, annotation_only=annotation_only, @@ -891,10 +892,10 @@ def check_samtools(): max_edits_per_read = max_edits_per_read, cores = cores, verbose = verbose, - number_of_expected_bams=num_intervals_per_contig, skip_coverage=skip_coverage, keep_intermediate_files=keep_intermediate_files, - num_per_sublist=num_per_sublist + num_per_sublist=num_per_sublist, + interval_length=interval_length ) \ No newline at end of file diff --git a/src/core.py b/src/core.py index f1fc973..b96123c 100644 --- a/src/core.py +++ b/src/core.py @@ -23,13 +23,13 @@ import os, psutil -def run_edit_identifier(bampath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], num_intervals_per_contig=16, verbose=False, cores=64, min_read_quality=0, min_base_quality=0, dist_from_end=0): +def run_edit_identifier(bampath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], verbose=False, cores=64, min_read_quality=0, min_base_quality=0, dist_from_end=0, interval_length=2000000): # Make subfolder in which to information about edits edit_info_subfolder = '{}/edit_info'.format(output_folder) make_folder(edit_info_subfolder) - edit_finding_jobs = make_edit_finding_jobs(bampath, output_folder, strandedness, barcode_tag, barcode_whitelist, contigs, num_intervals_per_contig, verbose, min_read_quality, min_base_quality, dist_from_end) + edit_finding_jobs = make_edit_finding_jobs(bampath, output_folder, strandedness, barcode_tag, barcode_whitelist, contigs, verbose, min_read_quality, min_base_quality, dist_from_end, interval_length=interval_length) pretty_print("{} total jobs".format(len(edit_finding_jobs))) # Performance statistics diff --git a/src/utils.py b/src/utils.py index 93eaae3..285b558 100644 --- a/src/utils.py +++ b/src/utils.py @@ -91,7 +91,7 @@ def get_contigs_that_need_bams_written(expected_contigs, split_bams_folder, barc return contigs_to_write_bams_for -def make_edit_finding_jobs(bampath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], num_intervals_per_contig=16, verbose=False, min_read_quality=0, min_base_quality=0, dist_from_end=0): +def make_edit_finding_jobs(bampath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], verbose=False, min_read_quality=0, min_base_quality=0, dist_from_end=0, interval_length=2000000): jobs = [] @@ -104,6 +104,7 @@ def make_edit_finding_jobs(bampath, output_folder, strandedness, barcode_tag="CB contigs_to_use = set(contig_lengths_dict.keys()) else: contigs_to_use = set(contigs) + for contig in contig_lengths_dict.keys(): if contig not in contigs_to_use: @@ -112,7 +113,7 @@ def make_edit_finding_jobs(bampath, output_folder, strandedness, barcode_tag="CB pretty_print("\tContig {}".format(contig)) contig_length = contig_lengths_dict.get(contig) - intervals_for_contig = get_intervals(contig, contig_lengths_dict, num_intervals_per_contig) + intervals_for_contig = get_intervals(contig, contig_lengths_dict, interval_length) #print('\t\tintervals_for_contig: {}'.format(intervals_for_contig)) # Set up for pool for split_index, interval in enumerate(intervals_for_contig): @@ -178,12 +179,15 @@ def pretty_print(contents, style=''): # Line after pretty_print(styled_line) -def get_intervals(contig, contig_lengths_dict, num_intervals=4): +def get_intervals(contig, contig_lengths_dict, interval_length=2000000): """ - Splits contig coordinates into a list of {num_intervals} coordinates of equal size. + Splits contig coordinates into a list of coordinates of equal size specified by interval length. """ contig_length = contig_lengths_dict.get(contig) - interval_length = math.ceil(contig_length/num_intervals) + + if interval_length > contig_length: + interval_length = contig_length + start = 0 end = interval_length @@ -197,6 +201,9 @@ def get_intervals(contig, contig_lengths_dict, num_intervals=4): start = end end = start + interval_length + + print("\tContig {}: {} intervals of {} bases".format(contig, len(intervals), interval_length)) + return intervals