Skip to content

Commit

Permalink
adjustments for error catching
Browse files Browse the repository at this point in the history
  • Loading branch information
Eric Kofman committed Dec 1, 2024
1 parent 00e193b commit 5a10811
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 31 deletions.
25 changes: 18 additions & 7 deletions marine.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,15 @@ def split_bed_file(input_bed_file, output_folder, bam_filepaths, output_suffix='
"""
Split a BED file into multiple files based on suffixes in the first column.
Each line is assigned to the appropriate file based on the suffix.
e.g.:
10_AAACGAAAGTCACACT-1 6143263 6143264
10_AAACGAAAGTCACACT-1 11912575 11912576
10_AAACGAAAGTCACACT-1 12209751 12209752
10_AAACGAAAGTCACACT-1 13320235 13320236
10_AAACGAAAGTCACACT-1 27036085 27036086
"""
if not os.path.exists(output_folder):
os.makedirs(output_folder)
Expand All @@ -171,9 +180,10 @@ def split_bed_file(input_bed_file, output_folder, bam_filepaths, output_suffix='
for line in infile:
# Parse the first column to determine the suffix
columns = line.split()

chrom = columns[0] # Assuming the first column is the chromosome
for prefix, suffix in suffix_pairs:
if chrom.startswith(prefix) and chrom.endswith(suffix):
if chrom.startswith(f"{prefix}_") and chrom.endswith(suffix):
file_handles[prefix + suffix].write(line)
break

Expand All @@ -195,8 +205,9 @@ def generate_depths(output_folder, bam_filepaths, paired_end=False, barcode_tag=
"awk 'NR > 1 {{print $2, $4-1, $4}}' OFS='\t' \"$file\"; "
"done | sort -k1,1 -k2,2n -u > {}/combined_source_cells.bed;"
).format(output_folder, output_folder)
run_command(combine_edit_sites_command)

if not os.path.exists(f'{output_folder}/combined_source_cells.bed'):
run_command(combine_edit_sites_command)

all_depth_commands.append(combine_edit_sites_command)

Expand Down Expand Up @@ -447,11 +458,11 @@ def run(bam_filepath, annotation_bedfile_path, output_folder, contigs=[], strand
print("Deleting overall_label_to_list_of_contents...")
del overall_label_to_list_of_contents


with open('{}/manifest.txt'.format(logging_folder), 'a+') as f:
f.write(f'total_reads_processed\t{overall_total_reads_processed}\n')
for k, v in overall_counts_summary_df.items():
f.write(f'{k}\t{v}\n')
if not coverage_only:
with open('{}/manifest.txt'.format(logging_folder), 'a+') as f:
f.write(f'total_reads_processed\t{overall_total_reads_processed}\n')
for k, v in overall_counts_summary_df.items():
f.write(f'{k}\t{v}\n')

f.write(f'edits per read (EPR)\t{overall_counts_summary_df.get("total_edits")/overall_total_reads_processed}\n')

Expand Down
53 changes: 30 additions & 23 deletions src/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -665,10 +665,10 @@ def generate_and_run_bash_merge(output_folder, file1_path, file2_path, output_fi
bash_command = f"""#!/bin/bash
# Step 1: Adjust the depths file by adding a new column that incorporates both contig and position
# for join purposes, and sort by this column. Output in tab-separated format.
awk -v OFS='\\t' '{{print $1, $2+{position_adjustment}, $3, $1":"($2+{position_adjustment})}}' "{file2_path}" | sort -k4,4n | uniq > {output_folder}/depth_modified.tsv
awk -v OFS='\\t' '{{print $1, $2+{position_adjustment}, $3, $1":"($2+{position_adjustment})}}' "{file2_path}" | sort -k4,4V | uniq > {output_folder}/depth_modified.tsv
# Step 2: Sort the first file numerically by the join column (the column including both contig and position information)
sort -k3,3n "{file1_path}" | uniq > {output_folder}/final_edit_info_no_coverage_sorted.tsv
sort -k3,3V "{file1_path}" | uniq > {output_folder}/final_edit_info_no_coverage_sorted.tsv
# Step 3: Join the files on the specified columns, output all columns, and select necessary columns with tab separation
join -1 3 -2 4 -t $'\\t' {output_folder}/final_edit_info_no_coverage_sorted.tsv {output_folder}/depth_modified.tsv | awk -v OFS='\\t' '{{print $2, $3, $4, $5, $6, $7, $8, $11}}' > "{output_file_path}"
Expand Down Expand Up @@ -822,27 +822,34 @@ def calculate_coverage(bam_filepath, bed_filepath, output_filepath, output_matri
chrom, start, end = fields[0], int(fields[1]), int(fields[2])
regions.append((chrom, start, end))

print(f"\t{len(regions)} regions")

with pysam.AlignmentFile(bam_filepath, "rb") as bam, open(depths_file, "w") as out:
for chrom, start, end in regions:
# Get coverage, applying base quality filter
coverage = bam.count_coverage(
chrom, start, end, quality_threshold=0 # Mimics -Q 13 in samtools depth
)

# Calculate total coverage for each position and include zero-coverage positions
for pos in range(start, end):
total_coverage = sum(base[pos - start] for base in coverage)
out.write(f"{chrom}\t{pos}\t{total_coverage}\n")


if output_matrix_folder:
matrix_output_file = os.path.join(output_matrix_folder, f"{os.path.basename(depths_file).split('.')[0]}_matrix.tsv")

if not os.path.exists(matrix_output_file) or os.path.getsize(matrix_output_file) == 0:
# Pivot the depths output file into a matrix
pivot_depths_output(depths_file, matrix_output_file)
if len(regions) > 0:
print(f"\t{len(regions)} regions")

with pysam.AlignmentFile(bam_filepath, "rb") as bam, open(depths_file, "w") as out:
try:
for chrom, start, end in regions:
# Get coverage, applying base quality filter
coverage = bam.count_coverage(
chrom, start, end, quality_threshold=0 # Mimics -Q 13 in samtools depth
)

# Calculate total coverage for each position and include zero-coverage positions
for pos in range(start, end):
total_coverage = sum(base[pos - start] for base in coverage)
out.write(f"{chrom}\t{pos}\t{total_coverage}\n")
except Exception as e:
# Return the exception and the arguments for debugging
raise RuntimeError(
f"Error processing bam {bam_filepath} using bed {bed_filepath} at {chrom}:{start}-{end}, "
f"outputting to {depths_file}: {e}"
)

if output_matrix_folder:
matrix_output_file = os.path.join(output_matrix_folder, f"{os.path.basename(depths_file).split('.')[0]}_matrix.tsv")

if not os.path.exists(matrix_output_file) or os.path.getsize(matrix_output_file) == 0:
# Pivot the depths output file into a matrix
pivot_depths_output(depths_file, matrix_output_file)


def run_pysam_count_coverage(args_list, processes):
Expand Down
2 changes: 1 addition & 1 deletion tests/integration_tests_run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ echo "SC tests scripts"
ls -lh $MARINE/tests/$tests_folder/scripts/


for t in "only_5_cells_test" "only_5_cells_bulk_mode_test" "long_read_sc_test" "edge_case_test" "edge_case_dist_filter_test"
for t in "only_5_cells_test" "only_5_cells_bulk_mode_test" "only_5_cells_all_cells_coverage_test" "long_read_sc_test" "edge_case_test" "edge_case_dist_filter_test"

do
echo $t
Expand Down

0 comments on commit 5a10811

Please sign in to comment.