Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

make all intervals uniform size #48

Merged
merged 1 commit into from
Nov 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 17 additions & 16 deletions marine.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,10 @@ def delete_intermediate_files(output_folder):
shutil.rmtree(object_path)


def edit_finder(bam_filepath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], num_intervals_per_contig=16,
verbose=False, cores=64, min_read_quality=0, min_base_quality=0, dist_from_end=0):
def edit_finder(bam_filepath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[],
verbose=False, cores=64, min_read_quality=0, min_base_quality=0, dist_from_end=0, interval_length=2000000):

pretty_print("Each contig is being split into {} subsets...".format(num_intervals_per_contig))
pretty_print("Each contig is being split into subsets of length...".format(interval_length))

overall_label_to_list_of_contents, results, overall_time, overall_total_reads, \
total_seconds_for_reads, counts_summary_df = run_edit_identifier(
Expand All @@ -108,12 +108,12 @@ def edit_finder(bam_filepath, output_folder, strandedness, barcode_tag="CB", bar
barcode_tag=barcode_tag,
barcode_whitelist=barcode_whitelist,
contigs=contigs,
num_intervals_per_contig=num_intervals_per_contig,
verbose=verbose,
cores=cores,
min_read_quality=min_read_quality,
min_base_quality=min_base_quality,
dist_from_end=dist_from_end
dist_from_end=dist_from_end,
interval_length=interval_length
)

#print(overall_label_to_list_of_contents.keys())
Expand All @@ -139,6 +139,7 @@ def edit_finder(bam_filepath, output_folder, strandedness, barcode_tag="CB", bar

def bam_processing(overall_label_to_list_of_contents, output_folder, barcode_tag='CB', cores=1, number_of_expected_bams=4,
verbose=False):
# Only used for single-cell and/or long read reconfiguration of bams to optimize coverage calculation
split_bams_folder = '{}/split_bams'.format(output_folder)
make_folder(split_bams_folder)
contigs_to_generate_bams_for = get_contigs_that_need_bams_written(list(overall_label_to_list_of_contents.keys()),
Expand Down Expand Up @@ -424,10 +425,10 @@ def get_edits_with_coverage_df(output_folder,



def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_intervals_per_contig=16, strandedness=True, barcode_tag="CB", paired_end=False, barcode_whitelist_file=None, verbose=False, coverage_only=False, filtering_only=False, annotation_only=False, bedgraphs_list=[], sailor_list=[], min_base_quality = 15, min_read_quality = 0, min_dist_from_end = 10, max_edits_per_read = None, cores = 64, number_of_expected_bams=4,
def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strandedness=True, barcode_tag="CB", paired_end=False, barcode_whitelist_file=None, verbose=False, coverage_only=False, filtering_only=False, annotation_only=False, bedgraphs_list=[], sailor_list=[], min_base_quality = 15, min_read_quality = 0, min_dist_from_end = 10, max_edits_per_read = None, cores = 64, number_of_expected_bams=4,
keep_intermediate_files=False,
num_per_sublist=6,
skip_coverage=False):
skip_coverage=False, interval_length=2000000):

print_marine_logo()

Expand All @@ -450,7 +451,7 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_in
f.write('barcode_tag\t{}\n'.format(barcode_tag))
f.write('barcode_whitelist_file\t{}\n'.format(barcode_whitelist_file))
f.write('contigs\t{}\n'.format(contigs))
f.write('num_intervals_per_contig\t{}\n'.format(num_intervals_per_contig))
f.write('interval_length\t{}\n'.format(interval_length))
f.write('verbose\t{}\n'.format(verbose))
f.write('cores\t{}\n'.format(cores))
f.write('number_of_expected_bams\t{}\n'.format(number_of_expected_bams))
Expand Down Expand Up @@ -496,12 +497,12 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_in
barcode_tag,
barcode_whitelist,
subcontig_list,
num_intervals_per_contig,
verbose,
cores=cores,
min_read_quality=min_read_quality,
min_base_quality=min_base_quality,
dist_from_end=min_dist_from_end
dist_from_end=min_dist_from_end,
interval_length=interval_length
)

for k,v in counts_summary_df.items():
Expand Down Expand Up @@ -749,7 +750,8 @@ def check_samtools():
parser.add_argument('--paired_end', dest='paired_end', action='store_true', help='Assess coverage taking without double-counting paired end overlapping regions... slower but more accurate. Edits by default are only counted once for an entire pair, whether they show up on both ends or not.')
parser.add_argument('--skip_coverage', dest='skip_coverage', action='store_true')
parser.add_argument('--max_edits_per_read', type=int, default=None)
parser.add_argument('--num_intervals_per_contig', type=int, default=200, help='Intervals to split analysis into... more intervals can yield faster perforamance especially with multiple cores')
parser.add_argument('--num_intervals_per_contig', type=int, default=200, help='Deprecated')
parser.add_argument('--interval_length', type=int, default=2000000, help='Length of intervals split analysis into...')

args = parser.parse_args()
bam_filepath = args.bam_filepath
Expand Down Expand Up @@ -779,7 +781,7 @@ def check_samtools():
max_edits_per_read = args.max_edits_per_read

num_intervals_per_contig = args.num_intervals_per_contig

interval_length = args.interval_length
num_per_sublist = args.num_per_sublist


Expand Down Expand Up @@ -850,7 +852,7 @@ def check_samtools():
"\tMinimum distance from end:\t{}".format(min_dist_from_end),
"\tMaximum edits per read:\t{}".format(max_edits_per_read),
"\tContigs:\t{}".format(contigs),
"\tNumber of intervals:\t{}".format(num_intervals_per_contig),
"\tInterval length:\t{}".format(interval_length),
"\tCores:\t{}".format(cores),
"\tVerbose:\t{}".format(verbose),
"\tKeep intermediate files:\t{}".format(keep_intermediate_files),
Expand Down Expand Up @@ -879,7 +881,6 @@ def check_samtools():
barcode_tag=barcode_tag,
paired_end=paired_end,
barcode_whitelist_file=barcode_whitelist_file,
num_intervals_per_contig=num_intervals_per_contig,
coverage_only=coverage_only,
filtering_only=filtering_only,
annotation_only=annotation_only,
Expand All @@ -891,10 +892,10 @@ def check_samtools():
max_edits_per_read = max_edits_per_read,
cores = cores,
verbose = verbose,
number_of_expected_bams=num_intervals_per_contig,
skip_coverage=skip_coverage,
keep_intermediate_files=keep_intermediate_files,
num_per_sublist=num_per_sublist
num_per_sublist=num_per_sublist,
interval_length=interval_length
)


4 changes: 2 additions & 2 deletions src/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@

import os, psutil

def run_edit_identifier(bampath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], num_intervals_per_contig=16, verbose=False, cores=64, min_read_quality=0, min_base_quality=0, dist_from_end=0):
def run_edit_identifier(bampath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], verbose=False, cores=64, min_read_quality=0, min_base_quality=0, dist_from_end=0, interval_length=2000000):

# Make subfolder in which to information about edits
edit_info_subfolder = '{}/edit_info'.format(output_folder)
make_folder(edit_info_subfolder)

edit_finding_jobs = make_edit_finding_jobs(bampath, output_folder, strandedness, barcode_tag, barcode_whitelist, contigs, num_intervals_per_contig, verbose, min_read_quality, min_base_quality, dist_from_end)
edit_finding_jobs = make_edit_finding_jobs(bampath, output_folder, strandedness, barcode_tag, barcode_whitelist, contigs, verbose, min_read_quality, min_base_quality, dist_from_end, interval_length=interval_length)
pretty_print("{} total jobs".format(len(edit_finding_jobs)))

# Performance statistics
Expand Down
17 changes: 12 additions & 5 deletions src/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def get_contigs_that_need_bams_written(expected_contigs, split_bams_folder, barc
return contigs_to_write_bams_for


def make_edit_finding_jobs(bampath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], num_intervals_per_contig=16, verbose=False, min_read_quality=0, min_base_quality=0, dist_from_end=0):
def make_edit_finding_jobs(bampath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], verbose=False, min_read_quality=0, min_base_quality=0, dist_from_end=0, interval_length=2000000):

jobs = []

Expand All @@ -104,6 +104,7 @@ def make_edit_finding_jobs(bampath, output_folder, strandedness, barcode_tag="CB
contigs_to_use = set(contig_lengths_dict.keys())
else:
contigs_to_use = set(contigs)

for contig in contig_lengths_dict.keys():

if contig not in contigs_to_use:
Expand All @@ -112,7 +113,7 @@ def make_edit_finding_jobs(bampath, output_folder, strandedness, barcode_tag="CB
pretty_print("\tContig {}".format(contig))

contig_length = contig_lengths_dict.get(contig)
intervals_for_contig = get_intervals(contig, contig_lengths_dict, num_intervals_per_contig)
intervals_for_contig = get_intervals(contig, contig_lengths_dict, interval_length)
#print('\t\tintervals_for_contig: {}'.format(intervals_for_contig))
# Set up for pool
for split_index, interval in enumerate(intervals_for_contig):
Expand Down Expand Up @@ -178,12 +179,15 @@ def pretty_print(contents, style=''):
# Line after
pretty_print(styled_line)

def get_intervals(contig, contig_lengths_dict, num_intervals=4):
def get_intervals(contig, contig_lengths_dict, interval_length=2000000):
"""
Splits contig coordinates into a list of {num_intervals} coordinates of equal size.
Splits contig coordinates into a list of coordinates of equal size specified by interval length.
"""
contig_length = contig_lengths_dict.get(contig)
interval_length = math.ceil(contig_length/num_intervals)

if interval_length > contig_length:
interval_length = contig_length

start = 0
end = interval_length

Expand All @@ -197,6 +201,9 @@ def get_intervals(contig, contig_lengths_dict, num_intervals=4):

start = end
end = start + interval_length

print("\tContig {}: {} intervals of {} bases".format(contig, len(intervals), interval_length))

return intervals


Expand Down
Loading