From b4ee7b43c3e90193a5875d70af924d15b92a9d9d Mon Sep 17 00:00:00 2001 From: Shuhao Xu Date: Fri, 11 Oct 2024 14:21:29 -0700 Subject: [PATCH] empty concat error fix --- marine.py | 97 +++++++++++++++++++++++++++++++++++++++++++++++++---- src/core.py | 17 ++++++++-- 2 files changed, 105 insertions(+), 9 deletions(-) diff --git a/marine.py b/marine.py index 2be3029..6c2299d 100755 --- a/marine.py +++ b/marine.py @@ -67,7 +67,6 @@ def edit_finder(bam_filepath, output_folder, strandedness, barcode_tag="CB", bar cores=cores, min_read_quality=min_read_quality ) - #print(overall_label_to_list_of_contents.keys()) #print(overall_label_to_list_of_contents.get(list(overall_label_to_list_of_contents.keys())[0])) @@ -404,10 +403,14 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_in all_filter_stats = pd.DataFrame([r[1] for r in results]).sum(axis=0) print(all_filter_stats) - - with open('{}/manifest.txt'.format(logging_folder), 'a+') as f: - f.write(f'original_edits\t{float(all_filter_stats.loc["original"])}\n') - f.write(f'filtered_edits\t{float(all_filter_stats.loc["filtered"])}\n') + if len(all_filter_stats) == 0: + with open('{}/manifest.txt'.format(logging_folder), 'a+') as f: + f.write(f'original_edits\t{float(0)}\n') + f.write(f'filtered_edits\t{float(0)}\n') + else: + with open('{}/manifest.txt'.format(logging_folder), 'a+') as f: + f.write(f'original_edits\t{float(all_filter_stats.loc["original"])}\n') + f.write(f'filtered_edits\t{float(all_filter_stats.loc["filtered"])}\n') pretty_print("Total time to calculate coverage: {} minutes".format(round(total_time/60, 3))) @@ -450,7 +453,47 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_in final_site_level_information_df.write_csv('{}/final_filtered_site_info.tsv'.format(output_folder), separator='\t') - + # Edge case when no edits are found. + if len(final_site_level_information_df) == 0: + print("No edits found.") + columns = ['contig', 'position', 'ref', 'alt', 'read_id', 'strand', 'mapping_quality', 'coverage', 'site_id'] + empty_df = pd.DataFrame(columns=columns) + empty_df.to_csv('{}/final_filtered_site_info.tsv'.format(output_folder), sep='\t', index=False) + empty_df.to_csv('{}/final_filtered_site_info_annotated.tsv'.format(output_folder), sep='\t', index=False) + if len(sailor_list) > 0: + for conversion in sailor_list: + conversion_search = conversion[0] + '>' + conversion[1] + empty_df.to_csv('{}/sailor_style_sites_{}.bed'.format( + output_folder, + conversion_search.replace(">", "-")), + header=False, index=False, sep='\t') + + if len(bedgraphs_list) > 0: + bedgraph_folder = '{}/bedgraphs'.format(output_folder) + make_folder(bedgraph_folder) + for conversion in bedgraphs_list: + conversion_search = conversion[0] + '>' + conversion[1] + empty_df.to_csv('{}/{}_{}.bedgraph'.format(bedgraph_folder, output_folder.split('/')[-1], conversion), sep='\t', index=False, header=False) + + current, peak = tracemalloc.get_traced_memory() + + with open('{}/manifest.txt'.format(logging_folder), 'a+') as f: + f.write(f'sites\t{len(final_site_level_information_df)}\n') + f.write(f'peak_memory_mb\t{peak/1e6}\n') + f.write(f'time_elapsed_seconds\t{time.time()-start_time:.2f}s\n') + + print(f"Current memory usage {current/1e6}MB; Peak: {peak/1e6}MB") + print(f'Time elapsed: {time.time()-start_time:.2f}s') + + if not keep_intermediate_files: + pretty_print("Deleting intermediate files...", style="-") + delete_intermediate_files(output_folder) + + pretty_print("Done!", style="+") + return 'Done' + + + pretty_print("Adding strand-specific conversion...\n") final_site_level_information_df = pd.read_csv('{}/final_filtered_site_info.tsv'.format(output_folder), sep='\t') @@ -503,6 +546,48 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_in if not annotation_bedfile_path: print("annotation_bedfile_path argument not provided ...\ not annotating with feature information and strand-specific conversions.") + + if final_path_already_exists: + # Edge case when no edits are found. + final_site_level_information_df = pd.read_csv('{}/final_filtered_site_info.tsv'.format(output_folder), + sep='\t') + if len(final_site_level_information_df) == 0: + columns = ['contig', 'position', 'ref', 'alt', 'read_id', 'strand', 'mapping_quality', 'coverage', 'site_id'] + empty_df = pd.DataFrame(columns=columns) + print("No edits found.") + empty_df.to_csv('{}/final_filtered_site_info.tsv'.format(output_folder), sep='\t', index=False) + empty_df.to_csv('{}/final_filtered_site_info_annotated.tsv'.format(output_folder), sep='\t', index=False) + if len(sailor_list) > 0: + for conversion in sailor_list: + conversion_search = conversion[0] + '>' + conversion[1] + empty_df.to_csv('{}/sailor_style_sites_{}.bed'.format( + output_folder, + conversion_search.replace(">", "-")), + header=False, index=False, sep='\t') + + if len(bedgraphs_list) > 0: + bedgraph_folder = '{}/bedgraphs'.format(output_folder) + make_folder(bedgraph_folder) + for conversion in bedgraphs_list: + conversion_search = conversion[0] + '>' + conversion[1] + empty_df.to_csv('{}/{}_{}.bedgraph'.format(bedgraph_folder, output_folder.split('/')[-1], conversion), sep='\t', index=False, header=False) + + current, peak = tracemalloc.get_traced_memory() + + with open('{}/manifest.txt'.format(logging_folder), 'a+') as f: + f.write(f'sites\t{len(final_site_level_information_df)}\n') + f.write(f'peak_memory_mb\t{peak/1e6}\n') + f.write(f'time_elapsed_seconds\t{time.time()-start_time:.2f}s\n') + + print(f"Current memory usage {current/1e6}MB; Peak: {peak/1e6}MB") + print(f'Time elapsed: {time.time()-start_time:.2f}s') + + if not keep_intermediate_files: + pretty_print("Deleting intermediate files...", style="-") + delete_intermediate_files(output_folder) + + pretty_print("Done!", style="+") + return 'Done' if final_path_already_exists and annotation_bedfile_path: final_site_level_information_df = pd.read_csv('{}/final_filtered_site_info.tsv'.format(output_folder), diff --git a/src/core.py b/src/core.py index ab0cf52..e1bbae1 100644 --- a/src/core.py +++ b/src/core.py @@ -43,7 +43,7 @@ def run_edit_identifier(bampath, output_folder, strandedness, barcode_tag="CB", all_counts_summary_dfs = [] overall_count_summary_dict = defaultdict(lambda:0) - + count = 0 #multiprocessing.set_start_method('spawn') with get_context("spawn").Pool(processes=cores, maxtasksperchild=4) as p: max_ = len(edit_finding_jobs) @@ -58,7 +58,6 @@ def run_edit_identifier(bampath, output_folder, strandedness, barcode_tag="CB", if barcode_tag: # Only keep this information for single cell requirements overall_label_to_list_of_contents[_[0]][_[1]] = _[2] - total_reads = _[3] counts_summary_df = _[4] all_counts_summary_dfs.append(counts_summary_df) @@ -70,13 +69,16 @@ def run_edit_identifier(bampath, output_folder, strandedness, barcode_tag="CB", total_seconds_for_reads[overall_total_reads] = total_time overall_time = time.perf_counter() - start_time + + if len(all_counts_summary_dfs) == 0: + return overall_label_to_list_of_contents, results, overall_time, overall_total_reads, total_seconds_for_reads, pd.DataFrame() all_counts_summary_dfs_combined = pd.concat(all_counts_summary_dfs, axis=1) #print(all_counts_summary_dfs_combined.index, all_counts_summary_dfs_combined.columns) overall_count_summary_df = pd.DataFrame.from_dict(all_counts_summary_dfs_combined).sum(axis=1) #print(overall_count_summary_df) - + return overall_label_to_list_of_contents, results, overall_time, overall_total_reads, total_seconds_for_reads, overall_count_summary_df @@ -412,6 +414,11 @@ def gather_edit_information_across_subcontigs(output_folder, barcode_tag='CB', n def add_site_id(all_edit_info): + if len(all_edit_info) == 0: + print("DataFrame is empty. Returning an empty DataFrame.") + all_edit_info = all_edit_info.with_columns(pl.Series(name='site_id', values=[])) + return all_edit_info + return all_edit_info.with_columns( pl.concat_str( [ @@ -442,6 +449,10 @@ def generate_site_level_information(all_edit_info, skip_coverage=False): number_of_edits = len(all_edit_info) all_edit_info = add_site_id(all_edit_info) + + if len(all_edit_info) == 0: + print("DataFrame is empty. Returning an empty DataFrame.") + return all_edit_info identifying_values = ["site_id", "barcode", "contig", "position", "ref", "alt", "strand"]