Skip to content

Commit

Permalink
Merge pull request #47 from YeoLab/1031_optimize_coverage
Browse files Browse the repository at this point in the history
1031 optimize coverage
  • Loading branch information
ekofman authored Nov 5, 2024
2 parents 0392069 + f548f20 commit cc8c6a3
Show file tree
Hide file tree
Showing 13 changed files with 303 additions and 224 deletions.
243 changes: 183 additions & 60 deletions marine.py

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions marine_environment2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ dependencies:
- rfc3339-validator=0.1.4=py38h06a4308_0
- rfc3986-validator=0.1.1=py38h06a4308_0
- rpds-py=0.10.6=py38hb02cf49_0
- samtools
- scanpy=1.9.6=pyhd8ed1ab_1
- scikit-learn=1.3.2=py38ha25d942_2
- scipy=1.10.1=py38hf6e8229_1
Expand Down
45 changes: 25 additions & 20 deletions src/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,13 @@

import os, psutil

def run_edit_identifier(bampath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], num_intervals_per_contig=16, verbose=False, cores=64, min_read_quality = 0):
def run_edit_identifier(bampath, output_folder, strandedness, barcode_tag="CB", barcode_whitelist=None, contigs=[], num_intervals_per_contig=16, verbose=False, cores=64, min_read_quality=0, min_base_quality=0, dist_from_end=0):

    # Make subfolder in which to store information about edits
edit_info_subfolder = '{}/edit_info'.format(output_folder)
make_folder(edit_info_subfolder)

edit_finding_jobs = make_edit_finding_jobs(bampath, output_folder, strandedness, barcode_tag, barcode_whitelist, contigs, num_intervals_per_contig, verbose, min_read_quality)
edit_finding_jobs = make_edit_finding_jobs(bampath, output_folder, strandedness, barcode_tag, barcode_whitelist, contigs, num_intervals_per_contig, verbose, min_read_quality, min_base_quality, dist_from_end)
pretty_print("{} total jobs".format(len(edit_finding_jobs)))

# Performance statistics
Expand Down Expand Up @@ -130,7 +131,7 @@ def write_bam_file(reads, bam_file_name, header_string):
print("Failed at indexing {}, {}".format(bam_file_name, e))


def find_edits(bampath, contig, split_index, start, end, output_folder, barcode_tag="CB", strandedness=0, barcode_whitelist=None, verbose=False, min_read_quality = 0):
def find_edits(bampath, contig, split_index, start, end, output_folder, barcode_tag="CB", strandedness=0, barcode_whitelist=None, verbose=False, min_read_quality=0, min_base_quality=0, dist_from_end=0):
edit_info_subfolder = '{}/edit_info'.format(output_folder)

time_reporting = {}
Expand All @@ -147,6 +148,8 @@ def find_edits(bampath, contig, split_index, start, end, output_folder, barcode_
reads_for_contig = samfile.fetch(contig, start, end, multiple_iterators=True)

output_file = '{}/{}_{}_{}_{}_edit_info.tsv'.format(edit_info_subfolder, contig, split_index, start, end)
output_bedfile = '{}/{}_{}_{}_{}_edit_positions.bed'.format(edit_info_subfolder, contig, split_index, start, end)

remove_file_if_exists(output_file)

with open(output_file, 'w') as f:
Expand Down Expand Up @@ -177,7 +180,8 @@ def find_edits(bampath, contig, split_index, start, end, output_folder, barcode_
continue

try:
error_code, list_of_rows, num_edits_of_each_type = get_read_information(read, contig, strandedness=strandedness, barcode_tag=barcode_tag, verbose=verbose, min_read_quality=min_read_quality)
error_code, list_of_rows, num_edits_of_each_type = get_read_information(read, contig, strandedness=strandedness, barcode_tag=barcode_tag, verbose=verbose, min_read_quality=min_read_quality, min_base_quality=min_base_quality,
dist_from_end=dist_from_end)
except Exception as e:
print("Failed getting read info on\n{}, {}".format(read.to_string(), e))
break
Expand All @@ -196,7 +200,8 @@ def find_edits(bampath, contig, split_index, start, end, output_folder, barcode_
read_as_string = incorporate_barcode(read_as_string, contig, barcode)

read_lists_for_barcodes[barcode].append(read_as_string)




barcode_to_concatted_reads = {}
if barcode_tag:
Expand Down Expand Up @@ -232,11 +237,14 @@ def find_edits(bampath, contig, split_index, start, end, output_folder, barcode_
return barcode_to_concatted_reads, total_reads, counts, time_reporting


def find_edits_and_split_bams(bampath, contig, split_index, start, end, output_folder, strandedness=0, barcode_tag="CB", barcode_whitelist=None, verbose=False, min_read_quality=0, min_base_quality=0, dist_from_end=0):
    """Thin pass-through wrapper: run edit-finding for one (contig, start, end)
    interval and return find_edits' results unchanged.

    Returns the same 4-tuple as find_edits:
    (barcode_to_concatted_reads, total_reads, counts, time_reporting).
    """
    # All filtering knobs (read/base quality, distance from read end) are
    # forwarded verbatim to find_edits, which does the actual work.
    return find_edits(
        bampath, contig, split_index, start, end, output_folder,
        barcode_tag=barcode_tag,
        strandedness=strandedness,
        barcode_whitelist=barcode_whitelist,
        verbose=verbose,
        min_read_quality=min_read_quality,
        min_base_quality=min_base_quality,
        dist_from_end=dist_from_end,
    )

Expand All @@ -248,7 +256,7 @@ def find_edits_and_split_bams(bampath, contig, split_index, start, end, output_f
def find_edits_and_split_bams_wrapper(parameters):
try:
start_time = time.perf_counter()
bampath, contig, split_index, start, end, output_folder, strandedness, barcode_tag, barcode_whitelist, verbose, min_read_quality = parameters
bampath, contig, split_index, start, end, output_folder, strandedness, barcode_tag, barcode_whitelist, verbose, min_read_quality, min_base_quality, dist_from_end = parameters
label = '{}({}):{}-{}'.format(contig, split_index, start, end)

barcode_to_concatted_reads, total_reads, counts, time_reporting = find_edits_and_split_bams(
Expand All @@ -262,7 +270,9 @@ def find_edits_and_split_bams_wrapper(parameters):
barcode_tag=barcode_tag,
barcode_whitelist=barcode_whitelist,
verbose=verbose,
min_read_quality=min_read_quality
min_read_quality=min_read_quality,
min_base_quality=min_base_quality,
dist_from_end=dist_from_end
)
counts_df = pd.DataFrame.from_dict(counts)

Expand Down Expand Up @@ -306,15 +316,13 @@ def run_coverage_calculator(edit_info_grouped_per_contig_combined,
barcode_tag='CB',
paired_end=False,
verbose=False,
processes=16,
filters=None,
processes=16
):
coverage_counting_job_params = get_job_params_for_coverage_for_edits_in_contig(
edit_info_grouped_per_contig_combined,
output_folder,
barcode_tag=barcode_tag,
paired_end=paired_end,
filters=filters,
verbose=verbose
)

Expand Down Expand Up @@ -343,12 +351,12 @@ def run_coverage_calculator(edit_info_grouped_per_contig_combined,


def get_job_params_for_coverage_for_edits_in_contig(edit_info_grouped_per_contig_combined, output_folder,
                                                    barcode_tag='CB', paired_end=False, verbose=False):
    """Build the per-contig parameter lists consumed by the coverage workers.

    Each entry is [edit_info, contig, output_folder, barcode_tag, paired_end,
    verbose] — positional order matters to the worker that unpacks it.
    """
    return [
        [per_contig_edit_info, contig_name, output_folder, barcode_tag, paired_end, verbose]
        for contig_name, per_contig_edit_info in edit_info_grouped_per_contig_combined.items()
    ]

Expand Down Expand Up @@ -379,9 +387,6 @@ def gather_edit_information_across_subcontigs(output_folder, barcode_tag='CB', n
edit_info_df = pd.read_csv(edit_info_file, sep='\t')
edit_info_df['contig'] = edit_info_df['contig'].astype(str)
edit_info_df['position'] = edit_info_df['position'].astype(int)
edit_info_df['base_quality'] = edit_info_df['base_quality'].astype(int)
edit_info_df['mapping_quality'] = edit_info_df['mapping_quality'].astype(int)
edit_info_df['dist_from_end'] = edit_info_df['dist_from_end'].astype(int)

edit_info = pl.from_pandas(edit_info_df)

Expand All @@ -397,7 +402,7 @@ def gather_edit_information_across_subcontigs(output_folder, barcode_tag='CB', n
# For bulk data we don't have to do any filtering, the edits calculated for each split interval
# can be paired directly with the split bam for that interval
edit_info_grouped_per_contig["{}".format(split)].append(edit_info)

del edit_info_df

print("Done grouping! Concatenating ...")
Expand Down Expand Up @@ -444,7 +449,7 @@ def get_count_and_coverage_per_site(all_edit_info, skip_coverage=False):

def generate_site_level_information(all_edit_info, skip_coverage=False):
number_of_edits = len(all_edit_info)

all_edit_info = add_site_id(all_edit_info)

if len(all_edit_info) == 0:
Expand Down
16 changes: 10 additions & 6 deletions src/read_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,8 @@ def print_read_info(read):

print(str(read))

def get_read_information(read, contig, barcode_tag='CB', verbose=False, strandedness=0, min_read_quality = 0):
def get_read_information(read, contig, barcode_tag='CB', verbose=False, strandedness=0,
min_read_quality=0, min_base_quality=0, dist_from_end=0):
if barcode_tag is None:
read_barcode = 'no_barcode'
elif read.has_tag(barcode_tag):
Expand Down Expand Up @@ -226,12 +227,15 @@ def get_read_information(read, contig, barcode_tag='CB', verbose=False, stranded

distance_from_read_end = np.min([updated_position - reference_start, reference_end - updated_position])

list_of_rows.append([
read_barcode, str(contig), str(updated_position), ref, alt, read_id, reverse_or_forward, str(distance_from_read_end), str(qual), str(mapq)
])
if distance_from_read_end >= dist_from_end and int(qual) >= min_base_quality:

list_of_rows.append([
read_barcode, str(contig), str(updated_position), ref, alt, read_id, reverse_or_forward
])

num_edits_of_each_type['{}>{}'.format(ref, alt)] += 1

num_edits_of_each_type['{}>{}'.format(ref, alt)] += 1


return None, list_of_rows, num_edits_of_each_type


Expand Down
Loading

0 comments on commit cc8c6a3

Please sign in to comment.