Skip to content

Commit

Permalink
Add customizable samtools exclude flag (#112) (#503)
Browse files Browse the repository at this point in the history
* Add customizable samtools exclude flag

This adds a new parameter, `--samtools_exclude_flags` that allows one to
customize which reads are excluded from CRISPRessoWGS and CRISPRessoPooled runs.

* Allow for user filtering of reads in Pooled when calculating n_aligned

* Add parameter for CRISPRessoCORE to exclude reads

* Fix WGS not having access to args

* Update pooled samtools exclude flags to be bitwise OR and add unit tests

* Point to new test branch

* Update help message for samtools_exclude_flags

* Move integration test branch back to master
  • Loading branch information
Colelyman authored Nov 22, 2024
1 parent ecac07a commit 6a363b7
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 20 deletions.
4 changes: 2 additions & 2 deletions CRISPResso2/CRISPRessoCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -819,9 +819,9 @@ def process_bam(bam_filename, bam_chr_loc, output_bam, variantCache, ref_names,
crispresso_cmd_to_write = ' '.join(sys.argv)
sam_out.write('@PG\tID:crispresso2\tPN:crispresso2\tVN:'+CRISPRessoShared.__version__+'\tCL:"'+crispresso_cmd_to_write+'"\n')
if bam_chr_loc != "":
proc = sb.Popen(['samtools', 'view', bam_filename, bam_chr_loc], stdout=sb.PIPE, encoding='utf-8')
proc = sb.Popen(['samtools', 'view', '-F', args.samtools_exclude_flags, bam_filename, bam_chr_loc], stdout=sb.PIPE, encoding='utf-8')
else:
proc = sb.Popen(['samtools', 'view', bam_filename], stdout=sb.PIPE, encoding='utf-8')
proc = sb.Popen(['samtools', 'view', '-F', args.samtools_exclude_flags, bam_filename], stdout=sb.PIPE, encoding='utf-8')
num_reads = 0

# Reading through the bam file and enriching variantCache as a dictionary with the following:
Expand Down
37 changes: 24 additions & 13 deletions CRISPResso2/CRISPRessoPooledCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,12 +154,22 @@ def get_n_reads_bam(bam_filename):
p = sb.Popen("samtools view -c %s" % bam_filename, shell=True, stdout=sb.PIPE)
return int(p.communicate()[0])

def get_n_aligned_bam(bam_filename):
p = sb.Popen("samtools view -F 0x904 -c %s" % bam_filename, shell=True, stdout=sb.PIPE)
def calculate_aligned_samtools_exclude_flags(samtools_exclude_flags):
"""Calculate the samtools exclude flags for aligned reads.
This function calculates the samtools exclude flags for aligned reads
by filtering 0x900 (not primary alignment and supplementary alignment)
and also including any other user specified filters.
"""
samtools_exclude_flags = int(samtools_exclude_flags, base=16)
return hex(0x900 | samtools_exclude_flags)

def get_n_aligned_bam(bam_filename, samtools_exclude_flags):
p = sb.Popen(f"samtools view -F {calculate_aligned_samtools_exclude_flags(samtools_exclude_flags)} -c {bam_filename}", shell=True, stdout=sb.PIPE)
return int(p.communicate()[0])

def get_n_aligned_bam_region(bam_filename, chr_name, chr_start, chr_end):
p = sb.Popen("samtools view -F 0x904 -c %s %s:%d-%d" %(bam_filename, chr_name, chr_start, chr_end), shell=True, stdout=sb.PIPE)
def get_n_aligned_bam_region(bam_filename, chr_name, chr_start, chr_end, samtools_exclude_flags):
p = sb.Popen(f"samtools view -F {calculate_aligned_samtools_exclude_flags(samtools_exclude_flags)} -c {bam_filename} {chr_name}:{chr_start}-{chr_end}", shell=True, stdout=sb.PIPE)
return int(p.communicate()[0])

def find_overlapping_genes(row, df_genes):
Expand Down Expand Up @@ -786,12 +796,13 @@ def main():
info('Alignment command: ' + aligner_command, {'percent_complete': 15})
sb.call(aligner_command, shell=True)

N_READS_ALIGNED = get_n_aligned_bam(bam_filename_amplicons)
N_READS_ALIGNED = get_n_aligned_bam(bam_filename_amplicons, args.samtools_exclude_flags)

if args.limit_open_files_for_demux:
bam_iter = CRISPRessoShared.get_command_output(
'(samtools sort {bam_file} | samtools view -F 4) 2>> {log_file}'.format(
'(samtools sort {bam_file} | samtools view -F {samtools_exclude_flags}) 2>> {log_file}'.format(
bam_file=bam_filename_amplicons,
samtools_exclude_flags=args.samtools_exclude_flags,
log_file=log_filename,
),
)
Expand Down Expand Up @@ -824,7 +835,7 @@ def main():
if curr_file is not None:
curr_file.close()
else:
s1 = r"samtools view -F 4 %s 2>>%s | grep -v ^'@'" % (bam_filename_amplicons,log_filename)
s1 = rf"samtools view -F {args.samtools_exclude_flags} {bam_filename_amplicons} 2>>{log_filename} | grep -v ^'@'"
s2 = r'''|awk '{ gzip_filename=sprintf("gzip >> OUTPUTPATH%s.fastq.gz",$3);\
print "@"$1"\n"$10"\n+\n"$11 | gzip_filename;}' '''

Expand Down Expand Up @@ -1014,7 +1025,7 @@ def rreplace(s, old, new):
info('Index file for input .bam file does not exist. Generating bam index file.')
sb.call('samtools index %s' % bam_filename_genome, shell=True)

N_READS_ALIGNED = get_n_aligned_bam(bam_filename_genome)
N_READS_ALIGNED = get_n_aligned_bam(bam_filename_genome, args.samtools_exclude_flags)
# save progress up to this point
crispresso2_info['running_info']['finished_steps']['n_reads_aligned_genome'] = N_READS_ALIGNED
CRISPRessoShared.write_crispresso_info(
Expand All @@ -1031,7 +1042,7 @@ def rreplace(s, old, new):

sb.call('samtools index %s' % bam_filename_genome, shell=True)

N_READS_ALIGNED = get_n_aligned_bam(bam_filename_genome)
N_READS_ALIGNED = get_n_aligned_bam(bam_filename_genome, args.samtools_exclude_flags)

# save progress up to this point
crispresso2_info['running_info']['finished_steps']['n_reads_aligned_genome'] = N_READS_ALIGNED
Expand Down Expand Up @@ -1061,7 +1072,7 @@ def rreplace(s, old, new):

# if we should only demultiplex where amplicons aligned... (as opposed to the whole genome)
if RUNNING_MODE=='AMPLICONS_AND_GENOME' and not args.demultiplex_genome_wide:
s1 = r'''samtools view -F 0x0004 %s __REGIONCHR__:__REGIONSTART__-__REGIONEND__ 2>>%s |''' % (bam_filename_genome, log_filename)+\
s1 = rf'''samtools view -F {args.samtools_exclude_flags} {bam_filename_genome} __REGIONCHR__:__REGIONSTART__-__REGIONEND__ 2>>{log_filename} |''' +\
r'''awk 'BEGIN{OFS="\t";num_records=0;fastq_filename="__OUTPUTPATH__REGION___REGIONCHR_____REGIONSTART_____REGIONEND__.fastq";} \
{ \
print "@"$1"\n"$10"\n+\n"$11 > fastq_filename; \
Expand Down Expand Up @@ -1093,7 +1104,7 @@ def rreplace(s, old, new):
else:
# next, create the general demux command
# variables like __CHR__ will be subbed out below for each iteration
s1 = r'''samtools view -F 0x0004 %s __CHR____REGION__ 2>>%s |''' % (bam_filename_genome, log_filename) + \
s1 = rf'''samtools view -F {args.samtools_exclude_flags} {bam_filename_genome} __CHR____REGION__ 2>>{log_filename} |''' + \
r'''awk 'BEGIN {OFS="\t"} {bpstart=$4; bpend=bpstart; split ($6,a,"[MIDNSHP]"); n=0;\
for (i=1; i in a; i++){\
n+=1+length(a[i]);\
Expand Down Expand Up @@ -1166,13 +1177,13 @@ def rreplace(s, old, new):
curr_end = curr_pos + chr_step_size
while curr_end < chr_len:
# make sure there aren't any reads at this breakpoint
n_reads_at_end = get_n_aligned_bam_region(bam_filename_genome, chr_str, curr_end-5, curr_end+5)
n_reads_at_end = get_n_aligned_bam_region(bam_filename_genome, chr_str, curr_end-5, curr_end+5, args.samtools_exclude_flags)
while n_reads_at_end > 0:
curr_end += 500 # look for another place with no reads
if curr_end >= chr_len:
curr_end = chr_len
break
n_reads_at_end = get_n_aligned_bam_region(bam_filename_genome, chr_str, curr_end-5, curr_end+5)
n_reads_at_end = get_n_aligned_bam_region(bam_filename_genome, chr_str, curr_end-5, curr_end+5, args.samtools_exclude_flags)

chr_output_filename = _jp('MAPPED_REGIONS/%s_%s_%s.info' % (chr_str, curr_pos, curr_end))
sub_chr_command = chr_cmd.replace("__REGION__", ":%d-%d "%(curr_pos, curr_end)).replace("__DEMUX_CHR_LOGFILENAME__",chr_output_filename)
Expand Down
11 changes: 6 additions & 5 deletions CRISPResso2/CRISPRessoWGSCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ def get_n_reads_fastq(fastq_filename):
n_reads = int(float(p.communicate()[0])/4.0)
return n_reads

def extract_reads(row):
def extract_reads(row, samtools_exclude_flags):
if row.sequence:
#create place-holder fastq files
open(row.fastq_file_trimmed_reads_in_region, 'w+').close()
Expand All @@ -217,7 +217,7 @@ def extract_reads(row):

info('Extracting reads in:%s and creating .bam file: %s' % (region, row.bam_file_with_reads_in_region))

cmd=r'''samtools view -b -F 4 --reference %s %s %s > %s ''' % (row.reference_file, row.original_bam, region, row.bam_file_with_reads_in_region)
cmd = rf'''samtools view -b -F {samtools_exclude_flags} --reference {row.reference_file} {row.original_bam} {region} > {row.bam_file_with_reads_in_region} '''
sb.call(cmd, shell=True)

cmd=r'''samtools index %s ''' % (row.bam_file_with_reads_in_region)
Expand All @@ -232,10 +232,10 @@ def extract_reads(row):

return row

def extract_reads_chunk(df):
def extract_reads_chunk(df, samtools_exclude_flags):
new_df = pd.DataFrame(columns=df.columns)
for i in range(len(df)):
new_df.loc[i] = extract_reads(df.iloc[i].copy())
new_df.loc[i] = extract_reads(df.iloc[i].copy(), samtools_exclude_flags)
new_df.set_index(df.index,inplace=True)
return new_df

Expand Down Expand Up @@ -577,7 +577,8 @@ def set_filenames(row):

else:
#run region extraction here
df_regions = CRISPRessoMultiProcessing.run_pandas_apply_parallel(df_regions, extract_reads_chunk, n_processes_for_wgs)
extract_reads_chunk_partial = lambda x: extract_reads_chunk(x, args.samtools_exclude_flags)
df_regions = CRISPRessoMultiProcessing.run_pandas_apply_parallel(df_regions, extract_reads_chunk_partial, n_processes_for_wgs)
df_regions.sort_values('region_number', inplace=True)
cols_to_print = ["chr_id", "bpstart", "bpend", "sgRNA", "Expected_HDR", "Coding_sequence", "sequence", "n_reads", "bam_file_with_reads_in_region", "fastq_file_trimmed_reads_in_region"]
if args.gene_annotations:
Expand Down
14 changes: 14 additions & 0 deletions CRISPResso2/args.json
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,20 @@
"default": "None",
"tools": ["Core", "Batch", "Pooled", "WGS"]
},
"samtools_exclude_flags": {
"keys": ["--samtools_exclude_flags"],
"help": "Exclude reads with any of the specified flags set in the SAM/BAM file. Flags can be specified in either base 16 (hex) or base 10. Default is 4 (read unmapped).",
"type": "str",
"default": "4",
"tools": ["Pooled", "WGS"]
},
"samtools_exclude_flags_core": {
"keys": ["--samtools_exclude_flags"],
"help": "Exclude reads with any of the specified flags set in the SAM/BAM file. Flags can be specified in either base 16 (hex) or base 10. Default is 0 (no reads filtered).",
"type": "str",
"default": "0",
"tools": ["Core"]
},
"stringent_flash_merging": {
"keys": ["--stringent_flash_merging"],
"help": "DEPRECATED in v2.3.0",
Expand Down
17 changes: 17 additions & 0 deletions tests/unit_tests/test_CRISPRessoPooledCORE.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from CRISPResso2 import CRISPRessoPooledCORE


def test_calculate_aligned_samtools_exclude_flags():
assert CRISPRessoPooledCORE.calculate_aligned_samtools_exclude_flags('0') == hex(0x900)


def test_calculate_aligned_samtools_exclude_flags_4():
assert CRISPRessoPooledCORE.calculate_aligned_samtools_exclude_flags('4') == hex(0x904)


def test_calculate_aligned_samtools_exclude_flags_9():
assert CRISPRessoPooledCORE.calculate_aligned_samtools_exclude_flags('9') == hex(0x909)


def test_calculate_aligned_samtools_exclude_flags_0x100():
assert CRISPRessoPooledCORE.calculate_aligned_samtools_exclude_flags('0x100') == hex(0x900)

0 comments on commit 6a363b7

Please sign in to comment.