Skip to content
This repository has been archived by the owner on Jul 18, 2024. It is now read-only.

Commit

Permalink
Daily update of sh16 scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
Pathogen Genomics, Wellcome Trust Sanger Institute committed Apr 8, 2014
1 parent baf5215 commit ed98ced
Show file tree
Hide file tree
Showing 4 changed files with 121 additions and 62 deletions.
27 changes: 23 additions & 4 deletions BEAST/prepare_BEAST_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def get_user_options():
group = OptionGroup(parser, "Required")
group.add_option("-i", "--input", action="store", dest="inputfile", help="Input file name", default="")
group.add_option("-o", "--output", action="store", dest="outfile", help="Output file name", default="")
group.add_option("-d", "--dates", action="store", dest="dates", help="Dates csv file name (Nmae,date)", default="")
group.add_option("-n", "--nons", action="store_true", dest="nons", help="Exclude sites which are constant other than Ns", default=False)
group.add_option("-g", "--gaps", action="store_true", dest="gaps", help="Gaps (-) are real", default=False)
parser.add_option_group(group)
Expand All @@ -76,11 +77,13 @@ def check_input_validity(options, args):
elif not os.path.isfile(options.inputfile):
DoError('Cannot find file '+options.inputfile)

if options.dates!='' and not os.path.isfile(options.dates):
DoError('Cannot find file '+options.dates)

if options.outfile=='':
options.outfile=options.ref.split("/")[-1].split(".")[0]


options.overwrite=False
while os.path.isfile(options.outfile) and options.overwrite==False:
outopt=""
outopt=raw_input('\nOutput files with chosen prefix already exist.\n\nWould you like to overwrite (o), choose a new output file prefix (n) or quit (Q): ')
Expand Down Expand Up @@ -203,8 +206,21 @@ def read_metadata(filehandle, name_column=1, unit_column=2, value_column=3, head
#print sequences.keys()
print "Found", len(sequences.keys()), "sequences of length", alnlen
sys.stdout.flush()


dates={}

if options.dates!="":

for line in open(options.dates):
words=line.strip().split(',')
if len(words)<2:
continue
if words[0] in sequences.keys():

try:
dates[words[0]]=float(words[1])
except ValueError:
continue


snplocations=[]
Expand Down Expand Up @@ -288,8 +304,11 @@ def read_metadata(filehandle, name_column=1, unit_column=2, value_column=3, head
# alignment.add_sequence(name, ''.join(snpsequence[name]))
# else:
# print name, "excluded from snp alignment as it is < "+str(options.exclude)+"% mapped"

alignment.add_sequence(name, ''.join(snpsequence[name]))
if name in dates:
alignment.add_sequence(name+"_"+str(dates[name]), ''.join(snpsequence[name]))
else:
alignment.add_sequence(name, ''.join(snpsequence[name]))



AlignIO.write([alignment], open(options.outfile, 'w'), "fasta")
Expand Down
136 changes: 79 additions & 57 deletions better_blast.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,15 +54,32 @@ def main():
################################

def check_input_validity(options, args):


options.formatdb=True

if options.query=='':
DoError('No query file selected')
elif not os.path.isfile(options.query):
DoError('Cannot find file '+options.query)
if options.subject=='':
DoError('No subject file selected')
elif not os.path.isfile(options.subject):
DoError('Cannot find file '+options.subject)
if options.subject=="nt":
options.subject="/data/blastdb/Supported/nt"
options.formatdb=False
maxlength=20000
elif options.subject=="nr":
options.subject="/data/blastdb/Supported/nr"
options.blastprog="blastx"
options.formatdb=False
maxlength=20000
elif options.subject=="refseq":
options.subject="/data/blastdb/Supported/refseq"
options.blastprog="blastx"
options.formatdb=False
maxlength=20000
else:
DoError('Cannot find file '+options.subject)
if options.tmpdir=='':
options.tmpdir="better_blast_tmp_dir"

Expand Down Expand Up @@ -133,7 +150,7 @@ def check_input_validity(options, args):
else:
print "Found", len(query_seqs), "query sequences"


mem=0.5
for query in query_seqs:
if ">" in str(query.seq):
print query.seq
Expand Down Expand Up @@ -184,72 +201,76 @@ def check_input_validity(options, args):
query_tmp_file.close()
query_info_file.close()

if options.filter:
filter_string=" -F F "
else:
filter_string=" -F T "

print "Preprocessing subject fasta file"
sys.stdout.flush()

subject_info_file_name=options.tmpdir+"/"+tmpname+".subjectinfo"
subject_info_file=open(subject_info_file_name, "w")

try:
subject_seqs=read_seq_file(options.subject)
except StandardError:
DoError("Cannot read subject file")

try:
subject_seqs=read_seq_file(options.subject)
if len(subject_seqs)==0:
if options.formatdb:
print "Preprocessing subject fasta file"
sys.stdout.flush()

subject_info_file_name=options.tmpdir+"/"+tmpname+".subjectinfo"
subject_info_file=open(subject_info_file_name, "w")

try:
subject_seqs=read_seq_file(options.subject)
except StandardError:
DoError("Cannot read subject file")

try:
subject_seqs=read_seq_file(options.subject)
if len(subject_seqs)==0:
try:
subject_seqs=[open_annotation(options.subject)]
except StandardError:
DoError("Cannot read subject file")

except StandardError:
try:
subject_seqs=[open_annotation(options.subject)]
except StandardError:
DoError("Cannot read subject file")

except StandardError:
try:
subject_seqs=[open_annotation(options.subject)]
except StandardError:
DoError("Cannot read subject file")

if len(subject_seqs)==0:
print DoError("No subject sequence found")
else:
print "Found", len(subject_seqs), "subject sequences"

subject_tot=0
subject_tmp_file_name=options.tmpdir+"/"+tmpname+"_subject.fasta"
subject_tmp_file=open(subject_tmp_file_name, "w")
subject_fragment_count=0
for subject in subject_seqs:
seqlength=len(str(subject.seq))
print >> subject_info_file, '\t'.join(["s"+str(subject_fragment_count), subject.id, str(subject_tot)])
print >> subject_tmp_file, ">s"+str(subject_fragment_count)
print >> subject_tmp_file, str(subject.seq)
subject_tot+=seqlength
subject_fragment_count+=1
subject_tmp_file.close()
if len(subject_seqs)==0:
print DoError("No subject sequence found")
else:
print "Found", len(subject_seqs), "subject sequences"

subject_tot=0
subject_tmp_file_name=options.tmpdir+"/"+tmpname+"_subject.fasta"
subject_tmp_file=open(subject_tmp_file_name, "w")
subject_fragment_count=0
for subject in subject_seqs:
seqlength=len(str(subject.seq))
print >> subject_info_file, '\t'.join(["s"+str(subject_fragment_count), subject.id, str(subject_tot)])
print >> subject_tmp_file, ">s"+str(subject_fragment_count)
print >> subject_tmp_file, str(subject.seq)
subject_tot+=seqlength
subject_fragment_count+=1
subject_tmp_file.close()



print "formatting subject database"
sys.stdout.flush()

formatdb_command="formatdb -p F -i "+subject_tmp_file_name
formatdb_args = shlex.split(formatdb_command)
formatdb_returnval=subprocess.call(formatdb_args)

if options.filter:
filter_string=" -F F "
if formatdb_returnval!=0:
DoError("formatdb command "+formatdb_command+" failed with exit value "+str(formatdb_returnval))
else:
filter_string=" -F T "

print "formatting subject database"
sys.stdout.flush()

formatdb_command="formatdb -p F -i "+subject_tmp_file_name
formatdb_args = shlex.split(formatdb_command)
formatdb_returnval=subprocess.call(formatdb_args)

if formatdb_returnval!=0:
DoError("formatdb command "+formatdb_command+" failed with exit value "+str(formatdb_returnval))

subject_tmp_file_name=options.subject
mem=8

print "Running blast jobs over lsf"
sys.stdout.flush()

blast_command="blastall -p "+options.blastprog+" -i "+query_tmp_file_name+"INDEX -m 8 -e "+str(options.e)+" -o "+options.tmpdir+"/"+tmpname+".blastINDEX -d "+subject_tmp_file_name+filter_string+" "+options.extras

job1 = farm.Bsub(options.tmpdir+"/"+tmpname+"_bb_bsub.out", options.tmpdir+"/"+tmpname+"_bb_bsub.err", tmpname+"_blast", "normal", 0.5, blast_command, start=1, end=query_file_count)
job1 = farm.Bsub(options.tmpdir+"/"+tmpname+"_bb_bsub.out", options.tmpdir+"/"+tmpname+"_bb_bsub.err", tmpname+"_blast", "normal", mem, blast_command, start=1, end=query_file_count)
job1_id = job1.run()

print "Job ID =", job1_id
Expand All @@ -263,8 +284,9 @@ def check_input_validity(options, args):
print "Job ID =", job2_id
sys.stdout.flush()

print "Once these jobs are finished, you can run a comparison in act using:"
print "act", options.subject, options.prefix+".crunch.gz", options.query
if options.formatdb:
print "Once these jobs are finished, you can run a comparison in act using:"
print "act", options.subject, options.prefix+".crunch.gz", options.query

job3 = farm.Bsub(options.prefix+"_bb_bsub.out", options.prefix+"_bb_bsub.err", tmpname+"_cleanup", "normal", 0.5, "rm -rf formatdb.log "+options.tmpdir)
job3.add_dependency(job2_id)
Expand Down
Binary file modified modules/Si_SNPs_temp.pyc
Binary file not shown.
20 changes: 19 additions & 1 deletion prepare_BEAST_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def get_user_options():
group = OptionGroup(parser, "Required")
group.add_option("-i", "--input", action="store", dest="inputfile", help="Input file name", default="")
group.add_option("-o", "--output", action="store", dest="outfile", help="Output file name", default="")
group.add_option("-d", "--dates", action="store", dest="dates", help="Dates csv file name (Nmae,date)", default="")
group.add_option("-n", "--nons", action="store_true", dest="nons", help="Exclude sites which are constant other than Ns", default=False)
group.add_option("-g", "--gaps", action="store_true", dest="gaps", help="Gaps (-) are real", default=False)
parser.add_option_group(group)
Expand All @@ -76,6 +77,8 @@ def check_input_validity(options, args):
elif not os.path.isfile(options.inputfile):
DoError('Cannot find file '+options.inputfile)

if options.dates!='' and not os.path.isfile(options.dates):
DoError('Cannot find file '+options.dates)

if options.outfile=='':
options.outfile=options.ref.split("/")[-1].split(".")[0]
Expand Down Expand Up @@ -140,7 +143,22 @@ def check_input_validity(options, args):
for line in lines:
words=line.strip().split('\n')
sequences[words[0].split()[0]]=''.join(words[1:])




if options.dates!="":
dates={}
for line in open(options.dates):
words=line.strip().split(',')
if len(words)<2:
continue
if words[0] in sequences.keys():

try:
dates[words[0]]=float(words[1])
except TypeError:
continue
print dates

alnlen=len(sequences[sequences.keys()[0]])

Expand Down

0 comments on commit ed98ced

Please sign in to comment.