Daily update of sh16 scripts

sanger-pathogens · Apr 8, 2014 · ed98ced · ed98ced
1 parent baf5215
commit ed98ced
Show file tree

Hide file tree

Showing 4 changed files with 121 additions and 62 deletions.
diff --git a/BEAST/prepare_BEAST_alignment.py b/BEAST/prepare_BEAST_alignment.py
@@ -57,6 +57,7 @@ def get_user_options():
 	group = OptionGroup(parser, "Required")
 	group.add_option("-i", "--input", action="store", dest="inputfile", help="Input file name", default="")
 	group.add_option("-o", "--output", action="store", dest="outfile", help="Output file name", default="")
+	group.add_option("-d", "--dates", action="store", dest="dates", help="Dates csv file name (Nmae,date)", default="")
 	group.add_option("-n", "--nons", action="store_true", dest="nons", help="Exclude sites which are constant other than Ns", default=False)
 	group.add_option("-g", "--gaps", action="store_true", dest="gaps", help="Gaps (-) are real", default=False)
 	parser.add_option_group(group)
@@ -76,11 +77,13 @@ def check_input_validity(options, args):
 	elif not os.path.isfile(options.inputfile):
 		DoError('Cannot find file '+options.inputfile)
 
+	if options.dates!='' and not os.path.isfile(options.dates):
+		DoError('Cannot find file '+options.dates)
 
 	if options.outfile=='':
 		options.outfile=options.ref.split("/")[-1].split(".")[0]
 
-
+	options.overwrite=False
 	while os.path.isfile(options.outfile) and options.overwrite==False:
 		outopt=""
 		outopt=raw_input('\nOutput files with chosen prefix already exist.\n\nWould you like to overwrite (o), choose a new output file prefix (n) or quit (Q): ')
@@ -203,8 +206,21 @@ def read_metadata(filehandle, name_column=1, unit_column=2, value_column=3, head
 	#print sequences.keys()
 	print "Found", len(sequences.keys()), "sequences of length", alnlen
 	sys.stdout.flush()
-
 
+	dates={}
+
+	if options.dates!="":
+
+		for line in open(options.dates):
+			words=line.strip().split(',')
+			if len(words)<2:
+				continue
+			if words[0] in sequences.keys():
+
+				try:
+					dates[words[0]]=float(words[1])
+				except ValueError:
+					continue
 
 
 	snplocations=[]
@@ -288,8 +304,11 @@ def read_metadata(filehandle, name_column=1, unit_column=2, value_column=3, head
 #			alignment.add_sequence(name, ''.join(snpsequence[name]))
 #		else:
 #			print name, "excluded from snp alignment as it is < "+str(options.exclude)+"% mapped"
-
-		alignment.add_sequence(name, ''.join(snpsequence[name]))
+		if name in dates:
+			alignment.add_sequence(name+"_"+str(dates[name]), ''.join(snpsequence[name]))
+		else:
+			alignment.add_sequence(name, ''.join(snpsequence[name]))
+
 
 
 	AlignIO.write([alignment], open(options.outfile, 'w'), "fasta")

diff --git a/better_blast.py b/better_blast.py
@@ -54,15 +54,32 @@ def main():
 ################################
 
 def check_input_validity(options, args):
-
+
+	options.formatdb=True
+
 	if options.query=='':
 		DoError('No query file selected')
 	elif not os.path.isfile(options.query):
 		DoError('Cannot find file '+options.query)
 	if options.subject=='':
 		DoError('No subject file selected')
 	elif not os.path.isfile(options.subject):
-		DoError('Cannot find file '+options.subject)
+		if options.subject=="nt":
+			options.subject="/data/blastdb/Supported/nt"
+			options.formatdb=False
+			maxlength=20000
+		elif options.subject=="nr":
+			options.subject="/data/blastdb/Supported/nr"
+			options.blastprog="blastx"
+			options.formatdb=False
+			maxlength=20000
+		elif options.subject=="refseq":
+			options.subject="/data/blastdb/Supported/refseq"
+			options.blastprog="blastx"
+			options.formatdb=False
+			maxlength=20000
+		else:	
+			DoError('Cannot find file '+options.subject)
 	if options.tmpdir=='':
 		options.tmpdir="better_blast_tmp_dir"
 
@@ -133,7 +150,7 @@ def check_input_validity(options, args):
 	else:
 		print "Found", len(query_seqs), "query sequences"
 
-
+	mem=0.5
 	for query in query_seqs:
 		if ">" in str(query.seq):
 			print query.seq
@@ -184,72 +201,76 @@ def check_input_validity(options, args):
 	query_tmp_file.close()
 	query_info_file.close()
 
+	if options.filter:
+		filter_string=" -F F "
+	else:
+		filter_string=" -F T "
 
-	print "Preprocessing subject fasta file"
-	sys.stdout.flush()
-
-	subject_info_file_name=options.tmpdir+"/"+tmpname+".subjectinfo"
-	subject_info_file=open(subject_info_file_name, "w")
-
-	try:
-		subject_seqs=read_seq_file(options.subject)
-	except StandardError:
-		DoError("Cannot read subject file")
-
-	try:
-		subject_seqs=read_seq_file(options.subject)
-		if len(subject_seqs)==0:
+	if options.formatdb:
+		print "Preprocessing subject fasta file"
+		sys.stdout.flush()
+
+		subject_info_file_name=options.tmpdir+"/"+tmpname+".subjectinfo"
+		subject_info_file=open(subject_info_file_name, "w")
+
+		try:
+			subject_seqs=read_seq_file(options.subject)
+		except StandardError:
+			DoError("Cannot read subject file")
+
+		try:
+			subject_seqs=read_seq_file(options.subject)
+			if len(subject_seqs)==0:
+				try:
+					subject_seqs=[open_annotation(options.subject)]
+				except StandardError:
+					DoError("Cannot read subject file")
+
+		except StandardError:
 			try:
 				subject_seqs=[open_annotation(options.subject)]
 			except StandardError:
 				DoError("Cannot read subject file")
 
-	except StandardError:
-		try:
-			subject_seqs=[open_annotation(options.subject)]
-		except StandardError:
-			DoError("Cannot read subject file")
-
-	if len(subject_seqs)==0:
-		print DoError("No subject sequence found")
-	else:
-		print "Found", len(subject_seqs), "subject sequences"
-
-	subject_tot=0
-	subject_tmp_file_name=options.tmpdir+"/"+tmpname+"_subject.fasta"
-	subject_tmp_file=open(subject_tmp_file_name, "w")
-	subject_fragment_count=0
-	for subject in subject_seqs:
-		seqlength=len(str(subject.seq))
-		print >> subject_info_file, '\t'.join(["s"+str(subject_fragment_count), subject.id, str(subject_tot)])
-		print >> subject_tmp_file, ">s"+str(subject_fragment_count)
-		print >> subject_tmp_file, str(subject.seq)
-		subject_tot+=seqlength
-		subject_fragment_count+=1
-	subject_tmp_file.close()
+		if len(subject_seqs)==0:
+			print DoError("No subject sequence found")
+		else:
+			print "Found", len(subject_seqs), "subject sequences"
+
+		subject_tot=0
+		subject_tmp_file_name=options.tmpdir+"/"+tmpname+"_subject.fasta"
+		subject_tmp_file=open(subject_tmp_file_name, "w")
+		subject_fragment_count=0
+		for subject in subject_seqs:
+			seqlength=len(str(subject.seq))
+			print >> subject_info_file, '\t'.join(["s"+str(subject_fragment_count), subject.id, str(subject_tot)])
+			print >> subject_tmp_file, ">s"+str(subject_fragment_count)
+			print >> subject_tmp_file, str(subject.seq)
+			subject_tot+=seqlength
+			subject_fragment_count+=1
+		subject_tmp_file.close()
+
+
+
+		print "formatting subject database"
+		sys.stdout.flush()
+
+		formatdb_command="formatdb -p F -i "+subject_tmp_file_name
+		formatdb_args = shlex.split(formatdb_command)
+		formatdb_returnval=subprocess.call(formatdb_args)
 
-	if options.filter:
-		filter_string=" -F F "
+		if formatdb_returnval!=0:
+			DoError("formatdb command "+formatdb_command+" failed with exit value "+str(formatdb_returnval))
 	else:
-		filter_string=" -F T "
-
-	print "formatting subject database"
-	sys.stdout.flush()
-
-	formatdb_command="formatdb -p F -i "+subject_tmp_file_name
-	formatdb_args = shlex.split(formatdb_command)
-	formatdb_returnval=subprocess.call(formatdb_args)
-
-	if formatdb_returnval!=0:
-		DoError("formatdb command "+formatdb_command+" failed with exit value "+str(formatdb_returnval))
-
+		subject_tmp_file_name=options.subject
+		mem=8
 
 	print "Running blast jobs over lsf"
 	sys.stdout.flush()
 
 	blast_command="blastall -p "+options.blastprog+" -i "+query_tmp_file_name+"INDEX -m 8 -e "+str(options.e)+" -o "+options.tmpdir+"/"+tmpname+".blastINDEX -d "+subject_tmp_file_name+filter_string+" "+options.extras
 
-	job1 = farm.Bsub(options.tmpdir+"/"+tmpname+"_bb_bsub.out", options.tmpdir+"/"+tmpname+"_bb_bsub.err", tmpname+"_blast", "normal", 0.5, blast_command, start=1, end=query_file_count)
+	job1 = farm.Bsub(options.tmpdir+"/"+tmpname+"_bb_bsub.out", options.tmpdir+"/"+tmpname+"_bb_bsub.err", tmpname+"_blast", "normal", mem, blast_command, start=1, end=query_file_count)
 	job1_id = job1.run()
 
 	print "Job ID =", job1_id
@@ -263,8 +284,9 @@ def check_input_validity(options, args):
 	print "Job ID =", job2_id
 	sys.stdout.flush()
 
-	print "Once these jobs are finished, you can run a comparison in act using:"
-	print "act", options.subject, options.prefix+".crunch.gz", options.query
+	if options.formatdb:
+		print "Once these jobs are finished, you can run a comparison in act using:"
+		print "act", options.subject, options.prefix+".crunch.gz", options.query
 
 	job3 = farm.Bsub(options.prefix+"_bb_bsub.out", options.prefix+"_bb_bsub.err", tmpname+"_cleanup", "normal", 0.5, "rm -rf formatdb.log "+options.tmpdir)
 	job3.add_dependency(job2_id) 

diff --git a/modules/Si_SNPs_temp.pyc b/modules/Si_SNPs_temp.pyc
diff --git a/prepare_BEAST_alignment.py b/prepare_BEAST_alignment.py
@@ -57,6 +57,7 @@ def get_user_options():
 	group = OptionGroup(parser, "Required")
 	group.add_option("-i", "--input", action="store", dest="inputfile", help="Input file name", default="")
 	group.add_option("-o", "--output", action="store", dest="outfile", help="Output file name", default="")
+	group.add_option("-d", "--dates", action="store", dest="dates", help="Dates csv file name (Nmae,date)", default="")
 	group.add_option("-n", "--nons", action="store_true", dest="nons", help="Exclude sites which are constant other than Ns", default=False)
 	group.add_option("-g", "--gaps", action="store_true", dest="gaps", help="Gaps (-) are real", default=False)
 	parser.add_option_group(group)
@@ -76,6 +77,8 @@ def check_input_validity(options, args):
 	elif not os.path.isfile(options.inputfile):
 		DoError('Cannot find file '+options.inputfile)
 
+	if options.dates!='' and not os.path.isfile(options.dates):
+		DoError('Cannot find file '+options.dates)
 
 	if options.outfile=='':
 		options.outfile=options.ref.split("/")[-1].split(".")[0]
@@ -140,7 +143,22 @@ def check_input_validity(options, args):
 	for line in lines:
 		words=line.strip().split('\n')
 		sequences[words[0].split()[0]]=''.join(words[1:])
-
+
+
+
+	if options.dates!="":
+		dates={}
+		for line in open(options.dates):
+			words=line.strip().split(',')
+			if len(words)<2:
+				continue
+			if words[0] in sequences.keys():
+
+				try:
+					dates[words[0]]=float(words[1])
+				except TypeError:
+					continue
+	print dates
 
 	alnlen=len(sequences[sequences.keys()[0]])