From 5a108114f4045d314f272d8eaf468bfa4a60cf9f Mon Sep 17 00:00:00 2001 From: Eric Kofman Date: Sat, 30 Nov 2024 18:13:53 -0800 Subject: [PATCH] adjustments for error catching --- marine.py | 25 +++++++++++----- src/utils.py | 53 +++++++++++++++++++--------------- tests/integration_tests_run.sh | 2 +- 3 files changed, 49 insertions(+), 31 deletions(-) diff --git a/marine.py b/marine.py index 3e85b43..a17b1a8 100755 --- a/marine.py +++ b/marine.py @@ -149,6 +149,15 @@ def split_bed_file(input_bed_file, output_folder, bam_filepaths, output_suffix=' """ Split a BED file into multiple files based on suffixes in the first column. Each line is assigned to the appropriate file based on the suffix. + + e.g.: + + 10_AAACGAAAGTCACACT-1 6143263 6143264 + 10_AAACGAAAGTCACACT-1 11912575 11912576 + 10_AAACGAAAGTCACACT-1 12209751 12209752 + 10_AAACGAAAGTCACACT-1 13320235 13320236 + 10_AAACGAAAGTCACACT-1 27036085 27036086 + """ if not os.path.exists(output_folder): os.makedirs(output_folder) @@ -171,9 +180,10 @@ def split_bed_file(input_bed_file, output_folder, bam_filepaths, output_suffix=' for line in infile: # Parse the first column to determine the suffix columns = line.split() + chrom = columns[0] # Assuming the first column is the chromosome for prefix, suffix in suffix_pairs: - if chrom.startswith(prefix) and chrom.endswith(suffix): + if chrom.startswith(f"{prefix}_") and chrom.endswith(suffix): file_handles[prefix + suffix].write(line) break @@ -195,8 +205,9 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False, barcode_tag= "awk 'NR > 1 {{print $2, $4-1, $4}}' OFS='\t' \"$file\"; " "done | sort -k1,1 -k2,2n -u > {}/combined_source_cells.bed;" ).format(output_folder, output_folder) - run_command(combine_edit_sites_command) + if not os.path.exists(f'{output_folder}/combined_source_cells.bed'): + run_command(combine_edit_sites_command) all_depth_commands.append(combine_edit_sites_command) @@ -447,11 +458,11 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand print("Deleting overall_label_to_list_of_contents...") del overall_label_to_list_of_contents - - with open('{}/manifest.txt'.format(logging_folder), 'a+') as f: - f.write(f'total_reads_processed\t{overall_total_reads_processed}\n') - for k, v in overall_counts_summary_df.items(): - f.write(f'{k}\t{v}\n') + if not coverage_only: + with open('{}/manifest.txt'.format(logging_folder), 'a+') as f: + f.write(f'total_reads_processed\t{overall_total_reads_processed}\n') + for k, v in overall_counts_summary_df.items(): + f.write(f'{k}\t{v}\n') f.write(f'edits per read (EPR)\t{overall_counts_summary_df.get("total_edits")/overall_total_reads_processed}\n') diff --git a/src/utils.py b/src/utils.py index 7347c14..d005bde 100644 --- a/src/utils.py +++ b/src/utils.py @@ -665,10 +665,10 @@ def generate_and_run_bash_merge(output_folder, file1_path, file2_path, output_fi bash_command = f"""#!/bin/bash # Step 1: Adjust the depths file by adding a new column that incorporates both contig and position # for join purposes, and sort by this column. Output in tab-separated format. - awk -v OFS='\\t' '{{print $1, $2+{position_adjustment}, $3, $1":"($2+{position_adjustment})}}' "{file2_path}" | sort -k4,4n | uniq > {output_folder}/depth_modified.tsv + awk -v OFS='\\t' '{{print $1, $2+{position_adjustment}, $3, $1":"($2+{position_adjustment})}}' "{file2_path}" | sort -k4,4V | uniq > {output_folder}/depth_modified.tsv # Step 2: Sort the first file numerically by the join column (the column incuding both contig and position information) - sort -k3,3n "{file1_path}" | uniq > {output_folder}/final_edit_info_no_coverage_sorted.tsv + sort -k3,3V "{file1_path}" | uniq > {output_folder}/final_edit_info_no_coverage_sorted.tsv # Step 3: Join the files on the specified columns, output all columns, and select necessary columns with tab separation join -1 3 -2 4 -t $'\\t' {output_folder}/final_edit_info_no_coverage_sorted.tsv {output_folder}/depth_modified.tsv | awk -v OFS='\\t' '{{print $2, $3, $4, $5, $6, $7, $8, $11}}' > "{output_file_path}" @@ -822,27 +822,34 @@ def calculate_coverage(bam_filepath, bed_filepath, output_filepath, output_matri chrom, start, end = fields[0], int(fields[1]), int(fields[2]) regions.append((chrom, start, end)) - print(f"\t{len(regions)} regions") - - with pysam.AlignmentFile(bam_filepath, "rb") as bam, open(depths_file, "w") as out: - for chrom, start, end in regions: - # Get coverage, applying base quality filter - coverage = bam.count_coverage( - chrom, start, end, quality_threshold=0 # Mimics -Q 13 in samtools depth - ) - - # Calculate total coverage for each position and include zero-coverage positions - for pos in range(start, end): - total_coverage = sum(base[pos - start] for base in coverage) - out.write(f"{chrom}\t{pos}\t{total_coverage}\n") - - - if output_matrix_folder: - matrix_output_file = os.path.join(output_matrix_folder, f"{os.path.basename(depths_file).split('.')[0]}_matrix.tsv") - - if not os.path.exists(matrix_output_file) or os.path.getsize(matrix_output_file) == 0: - # Pivot the depths output file into a matrix - pivot_depths_output(depths_file, matrix_output_file) + if len(regions) > 0: + print(f"\t{len(regions)} regions") + + with pysam.AlignmentFile(bam_filepath, "rb") as bam, open(depths_file, "w") as out: + try: + for chrom, start, end in regions: + # Get coverage, applying base quality filter + coverage = bam.count_coverage( + chrom, start, end, quality_threshold=0 # Mimics -Q 13 in samtools depth + ) + + # Calculate total coverage for each position and include zero-coverage positions + for pos in range(start, end): + total_coverage = sum(base[pos - start] for base in coverage) + out.write(f"{chrom}\t{pos}\t{total_coverage}\n") + except Exception as e: + # Return the exception and the arguments for debugging + raise RuntimeError( + f"Error processing bam {bam_filepath} using bed {bed_filepath} at {chrom}:{start}-{end}, " + f"outputting to {depths_file}: {e}" + ) + + if output_matrix_folder: + matrix_output_file = os.path.join(output_matrix_folder, f"{os.path.basename(depths_file).split('.')[0]}_matrix.tsv") + + if not os.path.exists(matrix_output_file) or os.path.getsize(matrix_output_file) == 0: + # Pivot the depths output file into a matrix + pivot_depths_output(depths_file, matrix_output_file) def run_pysam_count_coverage(args_list, processes): diff --git a/tests/integration_tests_run.sh b/tests/integration_tests_run.sh index f7ab6be..3d10906 100755 --- a/tests/integration_tests_run.sh +++ b/tests/integration_tests_run.sh @@ -34,7 +34,7 @@ echo "SC tests scripts" ls -lh $MARINE/tests/$tests_folder/scripts/ -for t in "only_5_cells_test" "only_5_cells_bulk_mode_test" "long_read_sc_test" "edge_case_test" "edge_case_dist_filter_test" +for t in "only_5_cells_test" "only_5_cells_bulk_mode_test" "only_5_cells_all_cells_coverage_test" "long_read_sc_test" "edge_case_test" "edge_case_dist_filter_test" do echo $t