Skip to content

Commit

Permalink
Fix error raised when concatenating an empty list of edit DataFrames
Browse files Browse the repository at this point in the history
  • Loading branch information
Shuhao Xu committed Oct 11, 2024
1 parent 4e3170c commit b4ee7b4
Show file tree
Hide file tree
Showing 2 changed files with 105 additions and 9 deletions.
97 changes: 91 additions & 6 deletions marine.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ def edit_finder(bam_filepath, output_folder, strandedness, barcode_tag="CB", bar
cores=cores,
min_read_quality=min_read_quality
)


#print(overall_label_to_list_of_contents.keys())
#print(overall_label_to_list_of_contents.get(list(overall_label_to_list_of_contents.keys())[0]))
Expand Down Expand Up @@ -404,10 +403,14 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_in

all_filter_stats = pd.DataFrame([r[1] for r in results]).sum(axis=0)
print(all_filter_stats)

with open('{}/manifest.txt'.format(logging_folder), 'a+') as f:
f.write(f'original_edits\t{float(all_filter_stats.loc["original"])}\n')
f.write(f'filtered_edits\t{float(all_filter_stats.loc["filtered"])}\n')
if len(all_filter_stats) == 0:
with open('{}/manifest.txt'.format(logging_folder), 'a+') as f:
f.write(f'original_edits\t{float(0)}\n')
f.write(f'filtered_edits\t{float(0)}\n')
else:
with open('{}/manifest.txt'.format(logging_folder), 'a+') as f:
f.write(f'original_edits\t{float(all_filter_stats.loc["original"])}\n')
f.write(f'filtered_edits\t{float(all_filter_stats.loc["filtered"])}\n')

pretty_print("Total time to calculate coverage: {} minutes".format(round(total_time/60, 3)))

Expand Down Expand Up @@ -450,7 +453,47 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_in
final_site_level_information_df.write_csv('{}/final_filtered_site_info.tsv'.format(output_folder),
separator='\t')


# Edge case when no edits are found.
if len(final_site_level_information_df) == 0:
print("No edits found.")
columns = ['contig', 'position', 'ref', 'alt', 'read_id', 'strand', 'mapping_quality', 'coverage', 'site_id']
empty_df = pd.DataFrame(columns=columns)
empty_df.to_csv('{}/final_filtered_site_info.tsv'.format(output_folder), sep='\t', index=False)
empty_df.to_csv('{}/final_filtered_site_info_annotated.tsv'.format(output_folder), sep='\t', index=False)
if len(sailor_list) > 0:
for conversion in sailor_list:
conversion_search = conversion[0] + '>' + conversion[1]
empty_df.to_csv('{}/sailor_style_sites_{}.bed'.format(
output_folder,
conversion_search.replace(">", "-")),
header=False, index=False, sep='\t')

if len(bedgraphs_list) > 0:
bedgraph_folder = '{}/bedgraphs'.format(output_folder)
make_folder(bedgraph_folder)
for conversion in bedgraphs_list:
conversion_search = conversion[0] + '>' + conversion[1]
empty_df.to_csv('{}/{}_{}.bedgraph'.format(bedgraph_folder, output_folder.split('/')[-1], conversion), sep='\t', index=False, header=False)

current, peak = tracemalloc.get_traced_memory()

with open('{}/manifest.txt'.format(logging_folder), 'a+') as f:
f.write(f'sites\t{len(final_site_level_information_df)}\n')
f.write(f'peak_memory_mb\t{peak/1e6}\n')
f.write(f'time_elapsed_seconds\t{time.time()-start_time:.2f}s\n')

print(f"Current memory usage {current/1e6}MB; Peak: {peak/1e6}MB")
print(f'Time elapsed: {time.time()-start_time:.2f}s')

if not keep_intermediate_files:
pretty_print("Deleting intermediate files...", style="-")
delete_intermediate_files(output_folder)

pretty_print("Done!", style="+")
return 'Done'



pretty_print("Adding strand-specific conversion...\n")
final_site_level_information_df = pd.read_csv('{}/final_filtered_site_info.tsv'.format(output_folder),
sep='\t')
Expand Down Expand Up @@ -503,6 +546,48 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_in
if not annotation_bedfile_path:
print("annotation_bedfile_path argument not provided ...\
not annotating with feature information and strand-specific conversions.")

if final_path_already_exists:
# Edge case when no edits are found.
final_site_level_information_df = pd.read_csv('{}/final_filtered_site_info.tsv'.format(output_folder),
sep='\t')
if len(final_site_level_information_df) == 0:
columns = ['contig', 'position', 'ref', 'alt', 'read_id', 'strand', 'mapping_quality', 'coverage', 'site_id']
empty_df = pd.DataFrame(columns=columns)
print("No edits found.")
empty_df.to_csv('{}/final_filtered_site_info.tsv'.format(output_folder), sep='\t', index=False)
empty_df.to_csv('{}/final_filtered_site_info_annotated.tsv'.format(output_folder), sep='\t', index=False)
if len(sailor_list) > 0:
for conversion in sailor_list:
conversion_search = conversion[0] + '>' + conversion[1]
empty_df.to_csv('{}/sailor_style_sites_{}.bed'.format(
output_folder,
conversion_search.replace(">", "-")),
header=False, index=False, sep='\t')

if len(bedgraphs_list) > 0:
bedgraph_folder = '{}/bedgraphs'.format(output_folder)
make_folder(bedgraph_folder)
for conversion in bedgraphs_list:
conversion_search = conversion[0] + '>' + conversion[1]
empty_df.to_csv('{}/{}_{}.bedgraph'.format(bedgraph_folder, output_folder.split('/')[-1], conversion), sep='\t', index=False, header=False)

current, peak = tracemalloc.get_traced_memory()

with open('{}/manifest.txt'.format(logging_folder), 'a+') as f:
f.write(f'sites\t{len(final_site_level_information_df)}\n')
f.write(f'peak_memory_mb\t{peak/1e6}\n')
f.write(f'time_elapsed_seconds\t{time.time()-start_time:.2f}s\n')

print(f"Current memory usage {current/1e6}MB; Peak: {peak/1e6}MB")
print(f'Time elapsed: {time.time()-start_time:.2f}s')

if not keep_intermediate_files:
pretty_print("Deleting intermediate files...", style="-")
delete_intermediate_files(output_folder)

pretty_print("Done!", style="+")
return 'Done'

if final_path_already_exists and annotation_bedfile_path:
final_site_level_information_df = pd.read_csv('{}/final_filtered_site_info.tsv'.format(output_folder),
Expand Down
17 changes: 14 additions & 3 deletions src/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def run_edit_identifier(bampath, output_folder, strandedness, barcode_tag="CB",

all_counts_summary_dfs = []
overall_count_summary_dict = defaultdict(lambda:0)

count = 0
#multiprocessing.set_start_method('spawn')
with get_context("spawn").Pool(processes=cores, maxtasksperchild=4) as p:
max_ = len(edit_finding_jobs)
Expand All @@ -58,7 +58,6 @@ def run_edit_identifier(bampath, output_folder, strandedness, barcode_tag="CB",
if barcode_tag:
# Only keep this information for single cell requirements
overall_label_to_list_of_contents[_[0]][_[1]] = _[2]

total_reads = _[3]
counts_summary_df = _[4]
all_counts_summary_dfs.append(counts_summary_df)
Expand All @@ -70,13 +69,16 @@ def run_edit_identifier(bampath, output_folder, strandedness, barcode_tag="CB",
total_seconds_for_reads[overall_total_reads] = total_time

overall_time = time.perf_counter() - start_time

if len(all_counts_summary_dfs) == 0:
return overall_label_to_list_of_contents, results, overall_time, overall_total_reads, total_seconds_for_reads, pd.DataFrame()

all_counts_summary_dfs_combined = pd.concat(all_counts_summary_dfs, axis=1)
#print(all_counts_summary_dfs_combined.index, all_counts_summary_dfs_combined.columns)

overall_count_summary_df = pd.DataFrame.from_dict(all_counts_summary_dfs_combined).sum(axis=1)
#print(overall_count_summary_df)

return overall_label_to_list_of_contents, results, overall_time, overall_total_reads, total_seconds_for_reads, overall_count_summary_df


Expand Down Expand Up @@ -412,6 +414,11 @@ def gather_edit_information_across_subcontigs(output_folder, barcode_tag='CB', n


def add_site_id(all_edit_info):
if len(all_edit_info) == 0:
print("DataFrame is empty. Returning an empty DataFrame.")
all_edit_info = all_edit_info.with_columns(pl.Series(name='site_id', values=[]))
return all_edit_info

return all_edit_info.with_columns(
pl.concat_str(
[
Expand Down Expand Up @@ -442,6 +449,10 @@ def generate_site_level_information(all_edit_info, skip_coverage=False):
number_of_edits = len(all_edit_info)

all_edit_info = add_site_id(all_edit_info)

if len(all_edit_info) == 0:
print("DataFrame is empty. Returning an empty DataFrame.")
return all_edit_info

identifying_values = ["site_id", "barcode", "contig", "position", "ref", "alt", "strand"]

Expand Down

0 comments on commit b4ee7b4

Please sign in to comment.