Skip to content

Commit

Permalink
make all intervals uniform size
Browse files Browse the repository at this point in the history
  • Loading branch information
Eric Kofman committed Nov 7, 2024
1 parent d64223f commit e6fa5a1
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 23 deletions.
33 changes: 17 additions & 16 deletions marine.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,10 @@ def delete_intermediate_files(output_folder):
shutil.rmtree(object_path)


def edit_finder(bam_filepath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], num_intervals_per_contig=16,
verbose=False, cores=64, min_read_quality=0, min_base_quality=0, dist_from_end=0):
def edit_finder(bam_filepath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[],
verbose=False, cores=64, min_read_quality=0, min_base_quality=0, dist_from_end=0, interval_length=2000000):

pretty_print("Each contig is being split into {} subsets...".format(num_intervals_per_contig))
pretty_print("Each contig is being split into subsets of length...".format(interval_length))

overall_label_to_list_of_contents, results, overall_time, overall_total_reads, \
total_seconds_for_reads, counts_summary_df = run_edit_identifier(
Expand All @@ -108,12 +108,12 @@ def edit_finder(bam_filepath, output_folder, strandedness, barcode_tag="CB", bar
barcode_tag=barcode_tag,
barcode_whitelist=barcode_whitelist,
contigs=contigs,
num_intervals_per_contig=num_intervals_per_contig,
verbose=verbose,
cores=cores,
min_read_quality=min_read_quality,
min_base_quality=min_base_quality,
dist_from_end=dist_from_end
dist_from_end=dist_from_end,
interval_length=interval_length
)

#print(overall_label_to_list_of_contents.keys())
Expand All @@ -139,6 +139,7 @@ def edit_finder(bam_filepath, output_folder, strandedness, barcode_tag="CB", bar

def bam_processing(overall_label_to_list_of_contents, output_folder, barcode_tag='CB', cores=1, number_of_expected_bams=4,
verbose=False):
# Only used for single-cell and/or long read reconfiguration of bams to optimize coverage calculation
split_bams_folder = '{}/split_bams'.format(output_folder)
make_folder(split_bams_folder)
contigs_to_generate_bams_for = get_contigs_that_need_bams_written(list(overall_label_to_list_of_contents.keys()),
Expand Down Expand Up @@ -424,10 +425,10 @@ def get_edits_with_coverage_df(output_folder,



def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_intervals_per_contig=16, strandedness=True, barcode_tag="CB", paired_end=False, barcode_whitelist_file=None, verbose=False, coverage_only=False, filtering_only=False, annotation_only=False, bedgraphs_list=[], sailor_list=[], min_base_quality = 15, min_read_quality = 0, min_dist_from_end = 10, max_edits_per_read = None, cores = 64, number_of_expected_bams=4,
def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strandedness=True, barcode_tag="CB", paired_end=False, barcode_whitelist_file=None, verbose=False, coverage_only=False, filtering_only=False, annotation_only=False, bedgraphs_list=[], sailor_list=[], min_base_quality = 15, min_read_quality = 0, min_dist_from_end = 10, max_edits_per_read = None, cores = 64, number_of_expected_bams=4,
keep_intermediate_files=False,
num_per_sublist=6,
skip_coverage=False):
skip_coverage=False, interval_length=2000000):

print_marine_logo()

Expand All @@ -450,7 +451,7 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_in
f.write('barcode_tag\t{}\n'.format(barcode_tag))
f.write('barcode_whitelist_file\t{}\n'.format(barcode_whitelist_file))
f.write('contigs\t{}\n'.format(contigs))
f.write('num_intervals_per_contig\t{}\n'.format(num_intervals_per_contig))
f.write('interval_length\t{}\n'.format(interval_length))
f.write('verbose\t{}\n'.format(verbose))
f.write('cores\t{}\n'.format(cores))
f.write('number_of_expected_bams\t{}\n'.format(number_of_expected_bams))
Expand Down Expand Up @@ -496,12 +497,12 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_in
barcode_tag,
barcode_whitelist,
subcontig_list,
num_intervals_per_contig,
verbose,
cores=cores,
min_read_quality=min_read_quality,
min_base_quality=min_base_quality,
dist_from_end=min_dist_from_end
dist_from_end=min_dist_from_end,
interval_length=interval_length
)

for k,v in counts_summary_df.items():
Expand Down Expand Up @@ -749,7 +750,8 @@ def check_samtools():
parser.add_argument('--paired_end', dest='paired_end', action='store_true', help='Assess coverage taking without double-counting paired end overlapping regions... slower but more accurate. Edits by default are only counted once for an entire pair, whether they show up on both ends or not.')
parser.add_argument('--skip_coverage', dest='skip_coverage', action='store_true')
parser.add_argument('--max_edits_per_read', type=int, default=None)
parser.add_argument('--num_intervals_per_contig', type=int, default=200, help='Intervals to split analysis into... more intervals can yield faster perforamance especially with multiple cores')
parser.add_argument('--num_intervals_per_contig', type=int, default=200, help='Deprecated')
parser.add_argument('--interval_length', type=int, default=2000000, help='Length of intervals split analysis into...')

args = parser.parse_args()
bam_filepath = args.bam_filepath
Expand Down Expand Up @@ -779,7 +781,7 @@ def check_samtools():
max_edits_per_read = args.max_edits_per_read

num_intervals_per_contig = args.num_intervals_per_contig

interval_length = args.interval_length
num_per_sublist = args.num_per_sublist


Expand Down Expand Up @@ -850,7 +852,7 @@ def check_samtools():
"\tMinimum distance from end:\t{}".format(min_dist_from_end),
"\tMaximum edits per read:\t{}".format(max_edits_per_read),
"\tContigs:\t{}".format(contigs),
"\tNumber of intervals:\t{}".format(num_intervals_per_contig),
"\tInterval length:\t{}".format(interval_length),
"\tCores:\t{}".format(cores),
"\tVerbose:\t{}".format(verbose),
"\tKeep intermediate files:\t{}".format(keep_intermediate_files),
Expand Down Expand Up @@ -879,7 +881,6 @@ def check_samtools():
barcode_tag=barcode_tag,
paired_end=paired_end,
barcode_whitelist_file=barcode_whitelist_file,
num_intervals_per_contig=num_intervals_per_contig,
coverage_only=coverage_only,
filtering_only=filtering_only,
annotation_only=annotation_only,
Expand All @@ -891,10 +892,10 @@ def check_samtools():
max_edits_per_read = max_edits_per_read,
cores = cores,
verbose = verbose,
number_of_expected_bams=num_intervals_per_contig,
skip_coverage=skip_coverage,
keep_intermediate_files=keep_intermediate_files,
num_per_sublist=num_per_sublist
num_per_sublist=num_per_sublist,
interval_length=interval_length
)


4 changes: 2 additions & 2 deletions src/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@

import os, psutil

def run_edit_identifier(bampath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], num_intervals_per_contig=16, verbose=False, cores=64, min_read_quality=0, min_base_quality=0, dist_from_end=0):
def run_edit_identifier(bampath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], verbose=False, cores=64, min_read_quality=0, min_base_quality=0, dist_from_end=0, interval_length=2000000):

# Make subfolder in which to information about edits
edit_info_subfolder = '{}/edit_info'.format(output_folder)
make_folder(edit_info_subfolder)

edit_finding_jobs = make_edit_finding_jobs(bampath, output_folder, strandedness, barcode_tag, barcode_whitelist, contigs, num_intervals_per_contig, verbose, min_read_quality, min_base_quality, dist_from_end)
edit_finding_jobs = make_edit_finding_jobs(bampath, output_folder, strandedness, barcode_tag, barcode_whitelist, contigs, verbose, min_read_quality, min_base_quality, dist_from_end, interval_length=interval_length)
pretty_print("{} total jobs".format(len(edit_finding_jobs)))

# Performance statistics
Expand Down
17 changes: 12 additions & 5 deletions src/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def get_contigs_that_need_bams_written(expected_contigs, split_bams_folder, barc
return contigs_to_write_bams_for


def make_edit_finding_jobs(bampath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], num_intervals_per_contig=16, verbose=False, min_read_quality=0, min_base_quality=0, dist_from_end=0):
def make_edit_finding_jobs(bampath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], verbose=False, min_read_quality=0, min_base_quality=0, dist_from_end=0, interval_length=2000000):

jobs = []

Expand All @@ -104,6 +104,7 @@ def make_edit_finding_jobs(bampath, output_folder, strandedness, barcode_tag="CB
contigs_to_use = set(contig_lengths_dict.keys())
else:
contigs_to_use = set(contigs)

for contig in contig_lengths_dict.keys():

if contig not in contigs_to_use:
Expand All @@ -112,7 +113,7 @@ def make_edit_finding_jobs(bampath, output_folder, strandedness, barcode_tag="CB
pretty_print("\tContig {}".format(contig))

contig_length = contig_lengths_dict.get(contig)
intervals_for_contig = get_intervals(contig, contig_lengths_dict, num_intervals_per_contig)
intervals_for_contig = get_intervals(contig, contig_lengths_dict, interval_length)
#print('\t\tintervals_for_contig: {}'.format(intervals_for_contig))
# Set up for pool
for split_index, interval in enumerate(intervals_for_contig):
Expand Down Expand Up @@ -178,12 +179,15 @@ def pretty_print(contents, style=''):
# Line after
pretty_print(styled_line)

def get_intervals(contig, contig_lengths_dict, num_intervals=4):
def get_intervals(contig, contig_lengths_dict, interval_length=2000000):
"""
Splits contig coordinates into a list of {num_intervals} coordinates of equal size.
Splits contig coordinates into a list of coordinates of equal size specified by interval length.
"""
contig_length = contig_lengths_dict.get(contig)
interval_length = math.ceil(contig_length/num_intervals)

if interval_length > contig_length:
interval_length = contig_length

start = 0
end = interval_length

Expand All @@ -197,6 +201,9 @@ def get_intervals(contig, contig_lengths_dict, num_intervals=4):

start = end
end = start + interval_length

print("\tContig {}: {} intervals of {} bases".format(contig, len(intervals), interval_length))

return intervals


Expand Down

0 comments on commit e6fa5a1

Please sign in to comment.