Commit 0392069
Merge pull request #46 from YeoLab/Zero_edit_edge_case
empty concat error fix
ekofman authored Oct 15, 2024
2 parents 4e3170c + dbb898b commit 0392069
Showing 2 changed files with 80 additions and 9 deletions.
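
For context (illustrative, not part of the diff): the "empty concat error" in the commit message is the ValueError pandas raises when pd.concat receives an empty sequence, which is what happened when a run produced zero edits. A minimal reproduction:

    import pandas as pd

    frames = []  # no per-contig summaries were produced
    try:
        pd.concat(frames, axis=1)
    except ValueError as err:
        print(err)  # "No objects to concatenate"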
75 changes: 69 additions & 6 deletions marine.py
@@ -36,6 +36,50 @@

from annotate import annotate_sites, get_strand_specific_conversion

def zero_edit_found(final_site_level_information_df, output_folder, sailor_list, bedgraphs_list, keep_intermediate_files, start_time, logging_folder):
    print("No edits found.")
    Sites_columns = ['site_id','barcode','contig','position','ref','alt','strand','count','coverage','conversion','strand_conversion']
    sites_empty_df = pd.DataFrame(columns=Sites_columns)
    sites_empty_df.to_csv('{}/final_filtered_site_info.tsv'.format(output_folder), sep='\t', index=False)

    Annotated_sites_columns = ['site_id','barcode','contig','position','ref','alt','strand','count','coverage','conversion','strand_conversion','feature_name','feature_type','feature_strand']
    annotated_sites_empty_df = pd.DataFrame(columns=Annotated_sites_columns)
    annotated_sites_empty_df.to_csv('{}/final_filtered_site_info_annotated.tsv'.format(output_folder), sep='\t', index=False)

    empty_df = pd.DataFrame()
    if len(sailor_list) > 0:
        for conversion in sailor_list:
            conversion_search = conversion[0] + '>' + conversion[1]
            empty_df.to_csv('{}/sailor_style_sites_{}.bed'.format(
                output_folder,
                conversion_search.replace(">", "-")),
                header=False, index=False, sep='\t')

    if len(bedgraphs_list) > 0:
        bedgraph_folder = '{}/bedgraphs'.format(output_folder)
        make_folder(bedgraph_folder)
        for conversion in bedgraphs_list:
            conversion_search = conversion[0] + '>' + conversion[1]
            empty_df.to_csv('{}/{}_{}.bedgraph'.format(bedgraph_folder, output_folder.split('/')[-1], conversion), sep='\t', index=False, header=False)

    current, peak = tracemalloc.get_traced_memory()

    with open('{}/manifest.txt'.format(logging_folder), 'a+') as f:
        f.write(f'sites\t{len(final_site_level_information_df)}\n')
        f.write(f'peak_memory_mb\t{peak/1e6}\n')
        f.write(f'time_elapsed_seconds\t{time.time()-start_time:.2f}s\n')

    print(f"Current memory usage {current/1e6}MB; Peak: {peak/1e6}MB")
    print(f'Time elapsed: {time.time()-start_time:.2f}s')

    if not keep_intermediate_files:
        pretty_print("Deleting intermediate files...", style="-")
        delete_intermediate_files(output_folder)

    pretty_print("Done!", style="+")
    return 'Done'
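
A note on the empty outputs written above (a sketch under the same column scheme, abbreviated here): a DataFrame built only from column names produces a header-only TSV, so downstream tools still see the expected schema even when no edits exist.

    import pandas as pd

    cols = ['site_id', 'barcode', 'contig', 'position']  # abbreviated column set
    pd.DataFrame(columns=cols).to_csv('empty_sites.tsv', sep='\t', index=False)
    # empty_sites.tsv contains exactly one header line and no data rows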


def delete_intermediate_files(output_folder):
    to_delete = ['coverage', 'edit_info', 'split_bams', 'all_edit_info.tsv',
                 'concat_command.sh', 'final_edit_info.tsv', 'final_filtered_edit_info.tsv']
@@ -67,7 +111,6 @@ def edit_finder(bam_filepath, output_folder, strandedness, barcode_tag="CB", bar
        cores=cores,
        min_read_quality=min_read_quality
    )

    #print(overall_label_to_list_of_contents.keys())
    #print(overall_label_to_list_of_contents.get(list(overall_label_to_list_of_contents.keys())[0]))
@@ -404,10 +447,16 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_in

    all_filter_stats = pd.DataFrame([r[1] for r in results]).sum(axis=0)
    print(all_filter_stats)

    if len(all_filter_stats) == 0:
        original_edits = float(0)
        filtered_edits = float(0)
    else:
        original_edits = float(all_filter_stats.loc["original"])
        filtered_edits = float(all_filter_stats.loc["filtered"])

    with open('{}/manifest.txt'.format(logging_folder), 'a+') as f:
-       f.write(f'original_edits\t{float(all_filter_stats.loc["original"])}\n')
-       f.write(f'filtered_edits\t{float(all_filter_stats.loc["filtered"])}\n')
+       f.write(f'original_edits\t{original_edits}\n')
+       f.write(f'filtered_edits\t{filtered_edits}\n')

    pretty_print("Total time to calculate coverage: {} minutes".format(round(total_time/60, 3)))
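
The new branch matters because summing an empty result list yields an empty Series, and all_filter_stats.loc["original"] would then raise KeyError. A small sketch of both cases (hypothetical counts):

    import pandas as pd

    stats = pd.DataFrame([]).sum(axis=0)   # empty Series when there are no results
    print(len(stats))                      # 0 -> fall back to 0.0 for both counts

    stats = pd.DataFrame([{"original": 10, "filtered": 7}]).sum(axis=0)
    print(float(stats.loc["original"]))    # 10.0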

@@ -447,10 +496,16 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_in
    final_site_level_information_df = generate_site_level_information(all_edit_info_filtered_pl, skip_coverage=skip_coverage)
    pretty_print("\tNumber of unique edit sites:\n\t{}".format(len(final_site_level_information_df)))
    pretty_print("Writing sites...\n")

    # Edge case when no edits are found.
    if len(final_site_level_information_df) == 0:
        output_zero_edit_files = zero_edit_found(final_site_level_information_df, output_folder, sailor_list, bedgraphs_list, keep_intermediate_files, start_time, logging_folder)
        return 'Done!'

    final_site_level_information_df.write_csv('{}/final_filtered_site_info.tsv'.format(output_folder),
                                              separator='\t')

    pretty_print("Adding strand-specific conversion...\n")
    final_site_level_information_df = pd.read_csv('{}/final_filtered_site_info.tsv'.format(output_folder),
                                                  sep='\t')
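
Worth noting (sketch, not part of the commit): the sites table is written with polars' write_csv and immediately re-read with pandas, so the TSV on disk is the hand-off point between the two libraries:

    import polars as pl
    import pandas as pd

    sites = pl.DataFrame({"contig": ["chr1"], "position": [1000], "count": [3]})
    sites.write_csv("final_filtered_site_info.tsv", separator="\t")   # polars writes...
    sites_pd = pd.read_csv("final_filtered_site_info.tsv", sep="\t")  # ...pandas reads back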
@@ -503,6 +558,14 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_in
    if not annotation_bedfile_path:
        print("annotation_bedfile_path argument not provided ...\
not annotating with feature information and strand-specific conversions.")

    if final_path_already_exists:
        # Edge case when no edits are found.
        final_site_level_information_df = pd.read_csv('{}/final_filtered_site_info.tsv'.format(output_folder),
                                                      sep='\t')
        if len(final_site_level_information_df) == 0:
            output_zero_edit_files = zero_edit_found(final_site_level_information_df, output_folder, sailor_list, bedgraphs_list, keep_intermediate_files, start_time, logging_folder)
            return 'Done!'

    if final_path_already_exists and annotation_bedfile_path:
        final_site_level_information_df = pd.read_csv('{}/final_filtered_site_info.tsv'.format(output_folder),
14 changes: 11 additions & 3 deletions src/core.py
@@ -43,7 +43,6 @@ def run_edit_identifier(bampath, output_folder, strandedness, barcode_tag="CB",

    all_counts_summary_dfs = []
    overall_count_summary_dict = defaultdict(lambda:0)
    #multiprocessing.set_start_method('spawn')
    with get_context("spawn").Pool(processes=cores, maxtasksperchild=4) as p:
        max_ = len(edit_finding_jobs)
@@ -58,7 +57,6 @@ def run_edit_identifier(bampath, output_folder, strandedness, barcode_tag="CB",
            if barcode_tag:
                # Only keep this information for single cell requirements
                overall_label_to_list_of_contents[_[0]][_[1]] = _[2]
            total_reads = _[3]
            counts_summary_df = _[4]
            all_counts_summary_dfs.append(counts_summary_df)
@@ -70,13 +68,16 @@ def run_edit_identifier(bampath, output_folder, strandedness, barcode_tag="CB",
        total_seconds_for_reads[overall_total_reads] = total_time

    overall_time = time.perf_counter() - start_time

    if len(all_counts_summary_dfs) == 0:
        return overall_label_to_list_of_contents, results, overall_time, overall_total_reads, total_seconds_for_reads, pd.DataFrame()

    all_counts_summary_dfs_combined = pd.concat(all_counts_summary_dfs, axis=1)
    #print(all_counts_summary_dfs_combined.index, all_counts_summary_dfs_combined.columns)

    overall_count_summary_df = pd.DataFrame.from_dict(all_counts_summary_dfs_combined).sum(axis=1)
    #print(overall_count_summary_df)

    return overall_label_to_list_of_contents, results, overall_time, overall_total_reads, total_seconds_for_reads, overall_count_summary_df
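
The early return keeps the function's six-value contract while avoiding the empty concat. The same pattern in isolation (combine_summaries is a hypothetical helper, not MARINE code):

    import pandas as pd

    def combine_summaries(summary_dfs):
        # bail out before pd.concat, which raises ValueError on an empty list
        if len(summary_dfs) == 0:
            return pd.DataFrame()  # empty sentinel for callers to test
        return pd.concat(summary_dfs, axis=1).sum(axis=1)

    print(combine_summaries([]).empty)                                        # True
    print(combine_summaries([pd.Series({"C>T": 3}), pd.Series({"C>T": 2})]))  # C>T  5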


@@ -412,6 +413,9 @@ def gather_edit_information_across_subcontigs(output_folder, barcode_tag='CB', n


def add_site_id(all_edit_info):
    if len(all_edit_info) == 0:
        return all_edit_info

    return all_edit_info.with_columns(
        pl.concat_str(
            [
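
The with_columns/concat_str call is truncated in this hunk; for orientation, a minimal polars sketch of composing a site_id from a few columns (the actual column list in marine differs):

    import polars as pl

    df = pl.DataFrame({"barcode": ["AAACCC"], "contig": ["chr1"], "position": [1000]})
    df = df.with_columns(
        pl.concat_str(
            [pl.col("barcode"), pl.col("contig"), pl.col("position")],
            separator="_",
        ).alias("site_id")
    )
    print(df["site_id"][0])  # AAACCC_chr1_1000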
@@ -442,6 +446,10 @@ def generate_site_level_information(all_edit_info, skip_coverage=False):
    number_of_edits = len(all_edit_info)

    all_edit_info = add_site_id(all_edit_info)

    if len(all_edit_info) == 0:
        print("DataFrame is empty. Returning an empty DataFrame.")
        return all_edit_info

    identifying_values = ["site_id", "barcode", "contig", "position", "ref", "alt", "strand"]

