diff --git a/BEAST/prepare_BEAST_alignment.py b/BEAST/prepare_BEAST_alignment.py index 40c90a7..6bf296a 100755 --- a/BEAST/prepare_BEAST_alignment.py +++ b/BEAST/prepare_BEAST_alignment.py @@ -57,6 +57,7 @@ def get_user_options(): group = OptionGroup(parser, "Required") group.add_option("-i", "--input", action="store", dest="inputfile", help="Input file name", default="") group.add_option("-o", "--output", action="store", dest="outfile", help="Output file name", default="") + group.add_option("-d", "--dates", action="store", dest="dates", help="Dates csv file name (Nmae,date)", default="") group.add_option("-n", "--nons", action="store_true", dest="nons", help="Exclude sites which are constant other than Ns", default=False) group.add_option("-g", "--gaps", action="store_true", dest="gaps", help="Gaps (-) are real", default=False) parser.add_option_group(group) @@ -76,11 +77,13 @@ def check_input_validity(options, args): elif not os.path.isfile(options.inputfile): DoError('Cannot find file '+options.inputfile) + if options.dates!='' and not os.path.isfile(options.dates): + DoError('Cannot find file '+options.dates) if options.outfile=='': options.outfile=options.ref.split("/")[-1].split(".")[0] - + options.overwrite=False while os.path.isfile(options.outfile) and options.overwrite==False: outopt="" outopt=raw_input('\nOutput files with chosen prefix already exist.\n\nWould you like to overwrite (o), choose a new output file prefix (n) or quit (Q): ') @@ -203,8 +206,21 @@ def read_metadata(filehandle, name_column=1, unit_column=2, value_column=3, head #print sequences.keys() print "Found", len(sequences.keys()), "sequences of length", alnlen sys.stdout.flush() - + dates={} + + if options.dates!="": + + for line in open(options.dates): + words=line.strip().split(',') + if len(words)<2: + continue + if words[0] in sequences.keys(): + + try: + dates[words[0]]=float(words[1]) + except ValueError: + continue snplocations=[] @@ -288,8 +304,11 @@ def read_metadata(filehandle, name_column=1, unit_column=2, value_column=3, head # alignment.add_sequence(name, ''.join(snpsequence[name])) # else: # print name, "excluded from snp alignment as it is < "+str(options.exclude)+"% mapped" - - alignment.add_sequence(name, ''.join(snpsequence[name])) + if name in dates: + alignment.add_sequence(name+"_"+str(dates[name]), ''.join(snpsequence[name])) + else: + alignment.add_sequence(name, ''.join(snpsequence[name])) + AlignIO.write([alignment], open(options.outfile, 'w'), "fasta") diff --git a/better_blast.py b/better_blast.py index 38a4d96..49cf5c6 100755 --- a/better_blast.py +++ b/better_blast.py @@ -54,7 +54,9 @@ def main(): ################################ def check_input_validity(options, args): - + + options.formatdb=True + if options.query=='': DoError('No query file selected') elif not os.path.isfile(options.query): @@ -62,7 +64,22 @@ def check_input_validity(options, args): if options.subject=='': DoError('No subject file selected') elif not os.path.isfile(options.subject): - DoError('Cannot find file '+options.subject) + if options.subject=="nt": + options.subject="/data/blastdb/Supported/nt" + options.formatdb=False + maxlength=20000 + elif options.subject=="nr": + options.subject="/data/blastdb/Supported/nr" + options.blastprog="blastx" + options.formatdb=False + maxlength=20000 + elif options.subject=="refseq": + options.subject="/data/blastdb/Supported/refseq" + options.blastprog="blastx" + options.formatdb=False + maxlength=20000 + else: + DoError('Cannot find file '+options.subject) if options.tmpdir=='': options.tmpdir="better_blast_tmp_dir" @@ -133,7 +150,7 @@ def check_input_validity(options, args): else: print "Found", len(query_seqs), "query sequences" - + mem=0.5 for query in query_seqs: if ">" in str(query.seq): print query.seq @@ -184,72 +201,76 @@ def check_input_validity(options, args): query_tmp_file.close() query_info_file.close() + if options.filter: + filter_string=" -F F " + else: + filter_string=" -F T " - print "Preprocessing subject fasta file" - sys.stdout.flush() - - subject_info_file_name=options.tmpdir+"/"+tmpname+".subjectinfo" - subject_info_file=open(subject_info_file_name, "w") - - try: - subject_seqs=read_seq_file(options.subject) - except StandardError: - DoError("Cannot read subject file") - - try: - subject_seqs=read_seq_file(options.subject) - if len(subject_seqs)==0: + if options.formatdb: + print "Preprocessing subject fasta file" + sys.stdout.flush() + + subject_info_file_name=options.tmpdir+"/"+tmpname+".subjectinfo" + subject_info_file=open(subject_info_file_name, "w") + + try: + subject_seqs=read_seq_file(options.subject) + except StandardError: + DoError("Cannot read subject file") + + try: + subject_seqs=read_seq_file(options.subject) + if len(subject_seqs)==0: + try: + subject_seqs=[open_annotation(options.subject)] + except StandardError: + DoError("Cannot read subject file") + + except StandardError: try: subject_seqs=[open_annotation(options.subject)] except StandardError: DoError("Cannot read subject file") - except StandardError: - try: - subject_seqs=[open_annotation(options.subject)] - except StandardError: - DoError("Cannot read subject file") - - if len(subject_seqs)==0: - print DoError("No subject sequence found") - else: - print "Found", len(subject_seqs), "subject sequences" - - subject_tot=0 - subject_tmp_file_name=options.tmpdir+"/"+tmpname+"_subject.fasta" - subject_tmp_file=open(subject_tmp_file_name, "w") - subject_fragment_count=0 - for subject in subject_seqs: - seqlength=len(str(subject.seq)) - print >> subject_info_file, '\t'.join(["s"+str(subject_fragment_count), subject.id, str(subject_tot)]) - print >> subject_tmp_file, ">s"+str(subject_fragment_count) - print >> subject_tmp_file, str(subject.seq) - subject_tot+=seqlength - subject_fragment_count+=1 - subject_tmp_file.close() + if len(subject_seqs)==0: + print DoError("No subject sequence found") + else: + print "Found", len(subject_seqs), "subject sequences" + + subject_tot=0 + subject_tmp_file_name=options.tmpdir+"/"+tmpname+"_subject.fasta" + subject_tmp_file=open(subject_tmp_file_name, "w") + subject_fragment_count=0 + for subject in subject_seqs: + seqlength=len(str(subject.seq)) + print >> subject_info_file, '\t'.join(["s"+str(subject_fragment_count), subject.id, str(subject_tot)]) + print >> subject_tmp_file, ">s"+str(subject_fragment_count) + print >> subject_tmp_file, str(subject.seq) + subject_tot+=seqlength + subject_fragment_count+=1 + subject_tmp_file.close() + + + + print "formatting subject database" + sys.stdout.flush() + + formatdb_command="formatdb -p F -i "+subject_tmp_file_name + formatdb_args = shlex.split(formatdb_command) + formatdb_returnval=subprocess.call(formatdb_args) - if options.filter: - filter_string=" -F F " + if formatdb_returnval!=0: + DoError("formatdb command "+formatdb_command+" failed with exit value "+str(formatdb_returnval)) else: - filter_string=" -F T " - - print "formatting subject database" - sys.stdout.flush() - - formatdb_command="formatdb -p F -i "+subject_tmp_file_name - formatdb_args = shlex.split(formatdb_command) - formatdb_returnval=subprocess.call(formatdb_args) - - if formatdb_returnval!=0: - DoError("formatdb command "+formatdb_command+" failed with exit value "+str(formatdb_returnval)) - + subject_tmp_file_name=options.subject + mem=8 print "Running blast jobs over lsf" sys.stdout.flush() blast_command="blastall -p "+options.blastprog+" -i "+query_tmp_file_name+"INDEX -m 8 -e "+str(options.e)+" -o "+options.tmpdir+"/"+tmpname+".blastINDEX -d "+subject_tmp_file_name+filter_string+" "+options.extras - job1 = farm.Bsub(options.tmpdir+"/"+tmpname+"_bb_bsub.out", options.tmpdir+"/"+tmpname+"_bb_bsub.err", tmpname+"_blast", "normal", 0.5, blast_command, start=1, end=query_file_count) + job1 = farm.Bsub(options.tmpdir+"/"+tmpname+"_bb_bsub.out", options.tmpdir+"/"+tmpname+"_bb_bsub.err", tmpname+"_blast", "normal", mem, blast_command, start=1, end=query_file_count) job1_id = job1.run() print "Job ID =", job1_id @@ -263,8 +284,9 @@ def check_input_validity(options, args): print "Job ID =", job2_id sys.stdout.flush() - print "Once these jobs are finished, you can run a comparison in act using:" - print "act", options.subject, options.prefix+".crunch.gz", options.query + if options.formatdb: + print "Once these jobs are finished, you can run a comparison in act using:" + print "act", options.subject, options.prefix+".crunch.gz", options.query job3 = farm.Bsub(options.prefix+"_bb_bsub.out", options.prefix+"_bb_bsub.err", tmpname+"_cleanup", "normal", 0.5, "rm -rf formatdb.log "+options.tmpdir) job3.add_dependency(job2_id) diff --git a/modules/Si_SNPs_temp.pyc b/modules/Si_SNPs_temp.pyc index 20dfa90..cb3942a 100644 Binary files a/modules/Si_SNPs_temp.pyc and b/modules/Si_SNPs_temp.pyc differ diff --git a/prepare_BEAST_alignment.py b/prepare_BEAST_alignment.py index a5b61cd..e0f6511 100755 --- a/prepare_BEAST_alignment.py +++ b/prepare_BEAST_alignment.py @@ -57,6 +57,7 @@ def get_user_options(): group = OptionGroup(parser, "Required") group.add_option("-i", "--input", action="store", dest="inputfile", help="Input file name", default="") group.add_option("-o", "--output", action="store", dest="outfile", help="Output file name", default="") + group.add_option("-d", "--dates", action="store", dest="dates", help="Dates csv file name (Nmae,date)", default="") group.add_option("-n", "--nons", action="store_true", dest="nons", help="Exclude sites which are constant other than Ns", default=False) group.add_option("-g", "--gaps", action="store_true", dest="gaps", help="Gaps (-) are real", default=False) parser.add_option_group(group) @@ -76,6 +77,8 @@ def check_input_validity(options, args): elif not os.path.isfile(options.inputfile): DoError('Cannot find file '+options.inputfile) + if options.dates!='' and not os.path.isfile(options.dates): + DoError('Cannot find file '+options.dates) if options.outfile=='': options.outfile=options.ref.split("/")[-1].split(".")[0] @@ -140,7 +143,22 @@ def check_input_validity(options, args): for line in lines: words=line.strip().split('\n') sequences[words[0].split()[0]]=''.join(words[1:]) - + + + + if options.dates!="": + dates={} + for line in open(options.dates): + words=line.strip().split(',') + if len(words)<2: + continue + if words[0] in sequences.keys(): + + try: + dates[words[0]]=float(words[1]) + except TypeError: + continue + print dates alnlen=len(sequences[sequences.keys()[0]])