From a7565e9baca8ef8f25f71ddeef321250e9f54a5a Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Wed, 24 Jul 2024 16:13:46 -0700 Subject: [PATCH] more work in shard --- marine.py | 26 ++------------------------ src/utils.py | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/marine.py b/marine.py index 85ada2b..4fd47b6 100755 --- a/marine.py +++ b/marine.py @@ -385,6 +385,7 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_in # Filtering steps if not final_path_already_exists: print("Filtering..") + all_edit_info_unique_position_with_coverage_df = pd.read_csv('{}/final_edit_info.tsv'.format(output_folder), sep='\t', index_col=0, names=[ @@ -393,32 +394,9 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_in 'coverage'], dtype={'base_quality': int, 'dist_from_end': int, 'contig': str}) - print("Deduplicating....") - distinguishing_columns = [ - "barcode", - "contig", - "position", - "ref", - "alt", - "read_id", - "strand", - "mapping_quality", - ] - if not skip_coverage: - distinguishing_columns.append("coverage") - - all_edit_info_filtered_deduped = all_edit_info_unique_position_with_coverage_df.drop_duplicates( - distinguishing_columns)[distinguishing_columns] - pretty_print("\tNumber of edits after filtering:\n\t{}".format(len(all_edit_info_unique_position_with_coverage_df))) - pretty_print("\tNumber of edits after deduplicating:\n\t{}".format(len(all_edit_info_filtered_deduped))) - all_edit_info_filtered_deduped.to_csv('{}/final_filtered_edit_info.tsv'.format(output_folder), - sep='\t', index=False) - - all_edit_info_unique_position_with_coverage_deduped_df = pd.read_csv('{}/final_filtered_edit_info.tsv'.format(output_folder), sep='\t', dtype={"contig": str}) - - all_edit_info_filtered_pl = pl.from_pandas(all_edit_info_unique_position_with_coverage_deduped_df) + all_edit_info_filtered_pl = pl.from_pandas(all_edit_info_unique_position_with_coverage_df) final_site_level_information_df = generate_site_level_information(all_edit_info_filtered_pl, skip_coverage=skip_coverage) pretty_print("\tNumber of unique edit sites:\n\t{}".format(len(final_site_level_information_df))) diff --git a/src/utils.py b/src/utils.py index eb6aad5..5959363 100644 --- a/src/utils.py +++ b/src/utils.py @@ -417,6 +417,7 @@ def filter_output_df(output_df, filters, output_filename): "base_quality", "mapping_quality" ] + all_edit_info_unique_position_df = filtered_output_df.drop_duplicates(distinguishing_columns)[distinguishing_columns] @@ -430,8 +431,23 @@ def filter_output_df(output_df, filters, output_filename): read_edits = all_edit_info_unique_position_with_coverage_df.groupby('read_id').count().sort_values('barcode') all_edit_info_unique_position_with_coverage_df = all_edit_info_unique_position_with_coverage_df[all_edit_info_unique_position_with_coverage_df.read_id.isin(read_edits[read_edits['barcode'] <= max_edits_per_read].index)] + distinguishing_columns = [ + "barcode", + "contig", + "position", + "ref", + "alt", + "read_id", + "strand", + "mapping_quality", + ] + + all_edit_info_unique_position_with_coverage_df = all_edit_info_unique_position_with_coverage_df.drop_duplicates( + distinguishing_columns)[distinguishing_columns] + filter_stats['filtered'] = len(all_edit_info_unique_position_with_coverage_df) + all_edit_info_unique_position_with_coverage_df.to_csv(output_filename, sep='\t', header=False) return filter_stats