Skip to content

Commit

Permalink
more work in shard
Browse files Browse the repository at this point in the history
  • Loading branch information
Eric Kofman committed Jul 24, 2024
1 parent 63c2ce4 commit a7565e9
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 24 deletions.
26 changes: 2 additions & 24 deletions marine.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,7 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_in
# Filtering steps
if not final_path_already_exists:
print("Filtering..")

all_edit_info_unique_position_with_coverage_df = pd.read_csv('{}/final_edit_info.tsv'.format(output_folder), sep='\t',
index_col=0,
names=[
Expand All @@ -393,32 +394,9 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], num_in
'coverage'],
dtype={'base_quality': int, 'dist_from_end': int, 'contig': str})

print("Deduplicating....")
distinguishing_columns = [
"barcode",
"contig",
"position",
"ref",
"alt",
"read_id",
"strand",
"mapping_quality",
]
if not skip_coverage:
distinguishing_columns.append("coverage")

all_edit_info_filtered_deduped = all_edit_info_unique_position_with_coverage_df.drop_duplicates(
distinguishing_columns)[distinguishing_columns]

pretty_print("\tNumber of edits after filtering:\n\t{}".format(len(all_edit_info_unique_position_with_coverage_df)))
pretty_print("\tNumber of edits after deduplicating:\n\t{}".format(len(all_edit_info_filtered_deduped)))
all_edit_info_filtered_deduped.to_csv('{}/final_filtered_edit_info.tsv'.format(output_folder),
sep='\t', index=False)

all_edit_info_unique_position_with_coverage_deduped_df = pd.read_csv('{}/final_filtered_edit_info.tsv'.format(output_folder), sep='\t', dtype={"contig": str})


all_edit_info_filtered_pl = pl.from_pandas(all_edit_info_unique_position_with_coverage_deduped_df)
all_edit_info_filtered_pl = pl.from_pandas(all_edit_info_unique_position_with_coverage_df)

final_site_level_information_df = generate_site_level_information(all_edit_info_filtered_pl, skip_coverage=skip_coverage)
pretty_print("\tNumber of unique edit sites:\n\t{}".format(len(final_site_level_information_df)))
Expand Down
16 changes: 16 additions & 0 deletions src/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,7 @@ def filter_output_df(output_df, filters, output_filename):
"base_quality",
"mapping_quality"
]


all_edit_info_unique_position_df = filtered_output_df.drop_duplicates(distinguishing_columns)[distinguishing_columns]

Expand All @@ -430,8 +431,23 @@ def filter_output_df(output_df, filters, output_filename):
read_edits = all_edit_info_unique_position_with_coverage_df.groupby('read_id').count().sort_values('barcode')
all_edit_info_unique_position_with_coverage_df = all_edit_info_unique_position_with_coverage_df[all_edit_info_unique_position_with_coverage_df.read_id.isin(read_edits[read_edits['barcode'] <= max_edits_per_read].index)]

distinguishing_columns = [
"barcode",
"contig",
"position",
"ref",
"alt",
"read_id",
"strand",
"mapping_quality",
]

all_edit_info_unique_position_with_coverage_df = all_edit_info_unique_position_with_coverage_df.drop_duplicates(
distinguishing_columns)[distinguishing_columns]

filter_stats['filtered'] = len(all_edit_info_unique_position_with_coverage_df)


all_edit_info_unique_position_with_coverage_df.to_csv(output_filename, sep='\t', header=False)

return filter_stats
Expand Down

0 comments on commit a7565e9

Please sign in to comment.