From ed98ced34111305f1a608466fcd0e897d65b33fa Mon Sep 17 00:00:00 2001 From: "Pathogen Genomics, Wellcome Trust Sanger Institute" Date: Tue, 8 Apr 2014 23:00:29 +0100 Subject: [PATCH] Daily update of sh16 scripts --- BEAST/prepare_BEAST_alignment.py | 27 +++++- better_blast.py | 136 ++++++++++++++++++------------- modules/Si_SNPs_temp.pyc | Bin 18286 -> 19302 bytes prepare_BEAST_alignment.py | 20 ++++- 4 files changed, 121 insertions(+), 62 deletions(-) diff --git a/BEAST/prepare_BEAST_alignment.py b/BEAST/prepare_BEAST_alignment.py index 40c90a7..6bf296a 100755 --- a/BEAST/prepare_BEAST_alignment.py +++ b/BEAST/prepare_BEAST_alignment.py @@ -57,6 +57,7 @@ def get_user_options(): group = OptionGroup(parser, "Required") group.add_option("-i", "--input", action="store", dest="inputfile", help="Input file name", default="") group.add_option("-o", "--output", action="store", dest="outfile", help="Output file name", default="") + group.add_option("-d", "--dates", action="store", dest="dates", help="Dates csv file name (Nmae,date)", default="") group.add_option("-n", "--nons", action="store_true", dest="nons", help="Exclude sites which are constant other than Ns", default=False) group.add_option("-g", "--gaps", action="store_true", dest="gaps", help="Gaps (-) are real", default=False) parser.add_option_group(group) @@ -76,11 +77,13 @@ def check_input_validity(options, args): elif not os.path.isfile(options.inputfile): DoError('Cannot find file '+options.inputfile) + if options.dates!='' and not os.path.isfile(options.dates): + DoError('Cannot find file '+options.dates) if options.outfile=='': options.outfile=options.ref.split("/")[-1].split(".")[0] - + options.overwrite=False while os.path.isfile(options.outfile) and options.overwrite==False: outopt="" outopt=raw_input('\nOutput files with chosen prefix already exist.\n\nWould you like to overwrite (o), choose a new output file prefix (n) or quit (Q): ') @@ -203,8 +206,21 @@ def read_metadata(filehandle, name_column=1, unit_column=2, value_column=3, head #print sequences.keys() print "Found", len(sequences.keys()), "sequences of length", alnlen sys.stdout.flush() - + dates={} + + if options.dates!="": + + for line in open(options.dates): + words=line.strip().split(',') + if len(words)<2: + continue + if words[0] in sequences.keys(): + + try: + dates[words[0]]=float(words[1]) + except ValueError: + continue snplocations=[] @@ -288,8 +304,11 @@ def read_metadata(filehandle, name_column=1, unit_column=2, value_column=3, head # alignment.add_sequence(name, ''.join(snpsequence[name])) # else: # print name, "excluded from snp alignment as it is < "+str(options.exclude)+"% mapped" - - alignment.add_sequence(name, ''.join(snpsequence[name])) + if name in dates: + alignment.add_sequence(name+"_"+str(dates[name]), ''.join(snpsequence[name])) + else: + alignment.add_sequence(name, ''.join(snpsequence[name])) + AlignIO.write([alignment], open(options.outfile, 'w'), "fasta") diff --git a/better_blast.py b/better_blast.py index 38a4d96..49cf5c6 100755 --- a/better_blast.py +++ b/better_blast.py @@ -54,7 +54,9 @@ def main(): ################################ def check_input_validity(options, args): - + + options.formatdb=True + if options.query=='': DoError('No query file selected') elif not os.path.isfile(options.query): @@ -62,7 +64,22 @@ def check_input_validity(options, args): if options.subject=='': DoError('No subject file selected') elif not os.path.isfile(options.subject): - DoError('Cannot find file '+options.subject) + if options.subject=="nt": + options.subject="/data/blastdb/Supported/nt" + options.formatdb=False + maxlength=20000 + elif options.subject=="nr": + options.subject="/data/blastdb/Supported/nr" + options.blastprog="blastx" + options.formatdb=False + maxlength=20000 + elif options.subject=="refseq": + options.subject="/data/blastdb/Supported/refseq" + options.blastprog="blastx" + options.formatdb=False + maxlength=20000 + else: + DoError('Cannot find file '+options.subject) if options.tmpdir=='': options.tmpdir="better_blast_tmp_dir" @@ -133,7 +150,7 @@ def check_input_validity(options, args): else: print "Found", len(query_seqs), "query sequences" - + mem=0.5 for query in query_seqs: if ">" in str(query.seq): print query.seq @@ -184,72 +201,76 @@ def check_input_validity(options, args): query_tmp_file.close() query_info_file.close() + if options.filter: + filter_string=" -F F " + else: + filter_string=" -F T " - print "Preprocessing subject fasta file" - sys.stdout.flush() - - subject_info_file_name=options.tmpdir+"/"+tmpname+".subjectinfo" - subject_info_file=open(subject_info_file_name, "w") - - try: - subject_seqs=read_seq_file(options.subject) - except StandardError: - DoError("Cannot read subject file") - - try: - subject_seqs=read_seq_file(options.subject) - if len(subject_seqs)==0: + if options.formatdb: + print "Preprocessing subject fasta file" + sys.stdout.flush() + + subject_info_file_name=options.tmpdir+"/"+tmpname+".subjectinfo" + subject_info_file=open(subject_info_file_name, "w") + + try: + subject_seqs=read_seq_file(options.subject) + except StandardError: + DoError("Cannot read subject file") + + try: + subject_seqs=read_seq_file(options.subject) + if len(subject_seqs)==0: + try: + subject_seqs=[open_annotation(options.subject)] + except StandardError: + DoError("Cannot read subject file") + + except StandardError: try: subject_seqs=[open_annotation(options.subject)] except StandardError: DoError("Cannot read subject file") - except StandardError: - try: - subject_seqs=[open_annotation(options.subject)] - except StandardError: - DoError("Cannot read subject file") - - if len(subject_seqs)==0: - print DoError("No subject sequence found") - else: - print "Found", len(subject_seqs), "subject sequences" - - subject_tot=0 - subject_tmp_file_name=options.tmpdir+"/"+tmpname+"_subject.fasta" - subject_tmp_file=open(subject_tmp_file_name, "w") - subject_fragment_count=0 - for subject in subject_seqs: - seqlength=len(str(subject.seq)) - print >> subject_info_file, '\t'.join(["s"+str(subject_fragment_count), subject.id, str(subject_tot)]) - print >> subject_tmp_file, ">s"+str(subject_fragment_count) - print >> subject_tmp_file, str(subject.seq) - subject_tot+=seqlength - subject_fragment_count+=1 - subject_tmp_file.close() + if len(subject_seqs)==0: + print DoError("No subject sequence found") + else: + print "Found", len(subject_seqs), "subject sequences" + + subject_tot=0 + subject_tmp_file_name=options.tmpdir+"/"+tmpname+"_subject.fasta" + subject_tmp_file=open(subject_tmp_file_name, "w") + subject_fragment_count=0 + for subject in subject_seqs: + seqlength=len(str(subject.seq)) + print >> subject_info_file, '\t'.join(["s"+str(subject_fragment_count), subject.id, str(subject_tot)]) + print >> subject_tmp_file, ">s"+str(subject_fragment_count) + print >> subject_tmp_file, str(subject.seq) + subject_tot+=seqlength + subject_fragment_count+=1 + subject_tmp_file.close() + + + + print "formatting subject database" + sys.stdout.flush() + + formatdb_command="formatdb -p F -i "+subject_tmp_file_name + formatdb_args = shlex.split(formatdb_command) + formatdb_returnval=subprocess.call(formatdb_args) - if options.filter: - filter_string=" -F F " + if formatdb_returnval!=0: + DoError("formatdb command "+formatdb_command+" failed with exit value "+str(formatdb_returnval)) else: - filter_string=" -F T " - - print "formatting subject database" - sys.stdout.flush() - - formatdb_command="formatdb -p F -i "+subject_tmp_file_name - formatdb_args = shlex.split(formatdb_command) - formatdb_returnval=subprocess.call(formatdb_args) - - if formatdb_returnval!=0: - DoError("formatdb command "+formatdb_command+" failed with exit value "+str(formatdb_returnval)) - + subject_tmp_file_name=options.subject + mem=8 print "Running blast jobs over lsf" sys.stdout.flush() blast_command="blastall -p "+options.blastprog+" -i "+query_tmp_file_name+"INDEX -m 8 -e "+str(options.e)+" -o "+options.tmpdir+"/"+tmpname+".blastINDEX -d "+subject_tmp_file_name+filter_string+" "+options.extras - job1 = farm.Bsub(options.tmpdir+"/"+tmpname+"_bb_bsub.out", options.tmpdir+"/"+tmpname+"_bb_bsub.err", tmpname+"_blast", "normal", 0.5, blast_command, start=1, end=query_file_count) + job1 = farm.Bsub(options.tmpdir+"/"+tmpname+"_bb_bsub.out", options.tmpdir+"/"+tmpname+"_bb_bsub.err", tmpname+"_blast", "normal", mem, blast_command, start=1, end=query_file_count) job1_id = job1.run() print "Job ID =", job1_id @@ -263,8 +284,9 @@ def check_input_validity(options, args): print "Job ID =", job2_id sys.stdout.flush() - print "Once these jobs are finished, you can run a comparison in act using:" - print "act", options.subject, options.prefix+".crunch.gz", options.query + if options.formatdb: + print "Once these jobs are finished, you can run a comparison in act using:" + print "act", options.subject, options.prefix+".crunch.gz", options.query job3 = farm.Bsub(options.prefix+"_bb_bsub.out", options.prefix+"_bb_bsub.err", tmpname+"_cleanup", "normal", 0.5, "rm -rf formatdb.log "+options.tmpdir) job3.add_dependency(job2_id) diff --git a/modules/Si_SNPs_temp.pyc b/modules/Si_SNPs_temp.pyc index 20dfa90ba17a8c62338c108dbcf625261172f696..cb3942a2343c7c463ce9556775c79f5d97280080 100644 GIT binary patch delta 6166 zcmbVQeQ;b=6+idA-A#70NjAy8?&hWFgMgoBv(!5QYO}feM zd%-Q;?J8R>h1wBrh0#%*5e7%p88FP~jKjBxq86E9WUPo3DN08x@-fc9ANV`>ZIVqX zADX;(-o5vnd(OG%NHS2+tz5H zh7#4YPmSVJD~GgIA_UkZf1Q=#~XDt*qkM zO4dFaA+l~{sYRk|2HKQ5`xP1n!3uJ!9V1JpzIcA-xc-=47!$DYum<44%O>A~*S-tD z5|q~{C&;R#@e0Z`ljWgp+bxayvN>`<;3Y>;4)Q?bqoPi|yLMAvC&!(pQM56sGeHgz ztM^f)J?UZfFe7t>a$2FB_FROG5$KTa0V!D(RO-Caky(I9D8L#7LxJ+O_)tEihUyeV?f{Lf0Yu6G8cNtA1tSD=Ivx(PBm_fh z1r;@p4i2eGG8`WzD?W+{v{A?{jO&znm(5=!x=Bz9i*&ieKp)NhgNRDpnfcYi@s}}& z-YSya1r!T{nCPT%0xW(kNI5LXPpfhe9>53dufhj@2;u{iRpSHu)!+kb)#3wlh46vB z>Sz`7RRVIgSYSQIVe$riVE-^aizpYN5hQc=6sK1qS;flr?s}f`QDt6iGGKuxO6#)^ z6G9d?#d>;o-A{S!H}D3Mx{~rOl>G|tkQVkW>{wCW410H_Dc{O-b4}EkYgLO|rMdkW z`VqH@5Q;306Uxe4^Aiu2JTYmV*EgKu-GmT8RJ~G9%H)UIyPmJdo*C{Per( zHew#IMx>;Dx=!m)pAr4pTLrHI8_%q5ctyPNq(uQsf~pvZ%wfkpl+pB^2!$M?k`)Q5 z{b-J|9i~F`VP0u!dmfvmk5@46Bl`}W+ly!0{<3kj=f*KRiuiLUpfigonZjK#m~nqG zHI@u;uMWquy2zXK%Ni@GV!dPnI4Y8_pq!t>n8z?vjk&3aeH{L4R~gr#R5rjZS zkp9rQ9~wi!h}BF+0ei=s;W=ax_t8Aehb(|3Xm-1YC;K3yM)5>mxQHKm5Yr$xu${1O zND3ejBaPmVpGJlMGmrXMTnmk3gAe#PEoa+SEAjq=5m*8=8inC4BMQTW(J`=U_9=ikRYh3yonxn_aJbgiV z40gwadX|cvCdvz_C2TL)S|U)~z+Rk!6RLS521jV~@h$w}2HwR1bIzTrFW!#hXvh4S z`oA92IHhGz!(sV&LtFPKSn=PN#<&1D`>+mS%!DgB68RLK7%?a1+YNg!SN-wCKHKE; z-sDpgVUDAGa?~YFXPo$w+0iY%eXtx+-;SL)$<0YL%{+j-JG>@z8~47>;8q5=$bW>_ ztv$-kn;9GiaJZ7fWM|)0LNVIBnVaT*#HvxW*d-Js|EFH z;nS)`yB5*S+h_98Pio>p`QxT+6OIYODMd6gIi0e057`OZ{5iVjLrU(U<`!|q%stHq zG!ILVv5qxzuq{-0gh?J{@EC)~0o3BMsxmSkRP-YoR4UY~@ z+2+r9^p^~tV(=>lzh=PNnNKrtbwW!`lxNx|#DSS>7k^C?hva*$9kQz9vpv5BI$p!n zO6s|E)Pc1Vj`H`AWtqQ|M>~3V&uL{I=h<`WILvH#GG!-H_Ov}XF=jh1>xF76m6Yic zj#62fVt8_5dT+|r{uyP=+>)i@iJ9*$>D4@}DvTK$iF1i)o~5da%X8U~tG;;~&MW#+ zc7yl{&7;W|mpvlR$h*!z6<3GmAkIhxr>bjrjJ=0Ha}*L-yj%9hn`gcg|CR`>00JH zjmdVKZ10N6JzY!1L-J_X@eO}wjxRHKox#5u{DHw=8N9-P=b8Uxpq%X~6x8ygw zFM`ykyHAS5%#SXswt8jFDIHq|3Tte5sJ3Q?SARy^gal5adeiyN)4(-GGQ{}I zlbVCgcOL0t8u1y#L4czibp*HNi8a?03|dHB;$$dJ!}>xbb23nFVfLajPN)ijz?l~)p>V=2e=J|7Y8wm!iv6$dIJ3>6I;i+%%`xJo`L zFI;a{%|~ld9$qiSPi56O$Z-o}f{66~f(&yST667VY6*5)fJ_w@|sF zz(TErIuX!)fX!EOnU(ibcB|l;5(hKx2PkEujeL_oB2csO#SjDIUdp^d*)@#%}HJJZJqPd|#g z%zntPpl#4*8-#Cz^ljUrBibXl%%}HLe+nzaU8IJxgIt*)G2o=kHLNQq3Cf-buHlFv zC{n_pRP-Z)JWA@70yx#j>T75{^YGE?QmyHy&#RuF((h6~ER}?CbsKVPY=|gh52I{n z6V9r)9gCRj#o2v3%tL@Ud>`MqT0AS?+1P3r?CRHeRBctf8^vqog_~A*Riw#*O`T%D zJh*9{ugF9X$tO3B3qy8oUgV8vA+)aCd|rWL!NAK~kBWr{5ilPo7B~uKV__86`AT94 z!y!g+ZAUo3m+Lk7!VL#_!;OZoYAzDc@)ekpBM1o{P`CT{yW}Lhdd?*%TjyK?jH=#e zUGm&qAM8plshD#Km~f|>(nWX#g(|Kcg`f&;C<(r}&*EnQ6$f@ioTa%Bzeovnvvcnk z*lA8E8bkwd>jmrYqx1;@Rxls}72NPc0Jpu|b|1tX--1qvY#L!(K#Y&@4;eVkaaZKv z#|=kKoelmv;_@1gs;gD-7X|m-;4cdN0)eSJ(?x-&9~aL|QXtfl@I`=yik<1Cz|#-i z5cCVwD}0cCc`duI+;d9J2Bl{B!_~xTROjYsr#7! z=#)Ut@Er452JM_tbIGKBam4@E=O!2ACLj4`CxZ)R$JV8FZhA6iIS*bSWO~^)`Q5G2?g<^c^Yy@>z7__4 zwe#^?;q&^m2->v*J<%ybqDEAU8r0y+fzrn8ejr6E%kGTRGE1d02PL=FbqAnll zac#>5kJHC{jh&iOHxza9Wmv8{Nco;*-p}A+2ESpTT%)Z1d+wfK@FxZ@F!%(UY$Xo7 zFRZ@={Sb^1#T(S$qo*{yo_eiW)QX^1udU{PJ!tqZsqYBh>yH{eZCl$m8*3Y$5BC2T DU;)Rg delta 5524 zcmb_gdvIJ;8UN1RM|QJ$Y#y84%{!$nNek6JNTtv=od(;2)Oh02$`r9wvr5fPM!6lOqC1ZL_uBmRD8H`^47I*!S` zzk43v`ObH~$2s5G7r&rhy;U{4Uau@4w72k-77h)nu;Ym z%^)ADsaRr9EIty`(ZJ5yIyO`Q#PV8gs$DIZ{ZgP&sVV*F)tkKb4d_@rt173J$DTl0 z-&yrd54CZ5vjW`)JqEo7%MAJq`VGR*Kp8MtLG|V; z4Z^k14!;7zseo`KAiM~eucJb8^#(&U46UMM+-M$JJZMqxG?GU`X_vqh|A7d!5llbf zlH}t8?ZI-NQW6K}0&|(qJpXRo&!d`0G>`iVmG;i#KIct^1%Qqc+^b+qOg~m3S!mHK z-C3x#3?HoDC*Aohg2<05R921;^cTPf>Z-s8`l`eS3agUtlDlRrvA}AyL*+qyp#K_t z>LnYJQ3OHejHG=s?x$?M+nzT3+I|CLbh3={4_{(!jj~$ zl?8wFW17UYWeD;Tb;82lX)BAUZjcD3M{gp?XP_W&6a+KzvsyrKpYC)GgabBfED zAIh*dLpha7*%Os}HV_g;!FLs=SHh5@`o?1@FVa;@FN{3%39)VZtk^ zDHqEvb7k+mc9cu)7+s=Ad9J;vio$xqi}idimk?eZ6@T}IX|a`1HSA^2Z~kB*DBdIIp~pXF3Gh@t}QRQcE(1wog=BB zsGD=P(&f<@0u}2RRIIldaz9DJIeg058P1hc<^f0y0mC?PLfAS=il@iI?<3eq7zKsk zoOEMs9`#VJ!tg|36nYN1;nScGVLPF02nuw-M_zZoTO9qbVU&MI9QAX$M!{B@e8I!8 zoGUv`!u+ zIYPb|%a#aKS$Ccu>qFd3L0?gzuaFWwS}sT* z^;Lj)^zvw%?uq(l`=YhVdk9VT?Rsb9dUdD1yYU6pp>Ju5s@rG3(exfm9nnv;WE!VI zgN;mNboZXP^NHb9EM?!raYqgRBdyKq^4Zg^H(KhDuIai+Pjz(V?_>W@2*(J=0cHi6 zaaX@D!OZO9g3NULQi;wIWo=Vw2V(ZTcKPNmu_yvKY*$)y50-d1mgwQKg!f?FumaLYxyd(qZa523rWiqt0v7m|_2O2yK~Xv2J62ovVX~e z1#xuE;&8WNbiPnV0HdW?+6ZVCKC*jq&#rhu)`yMWzP@;o`r+)ui+e5g5cJmd@wdri z{OCf8VmfwCQUASi%&gCMEwo-&+PCDmI-^f6InxNI?4<8*9eWFZHa(m(OK0Qk<4eD+ zsyY=a#=BFo$x%JL_^Ku*c|${E@v-#K&@!ZGsTCYx`IXWcC*?#`{1&h z+$ya^%2AFPwadflfSa<4}{+n%#2U5 z^gQ8@gck^B2(J;|0vIjWZx|I;uL-C-bkmyks#$+*&1F?Tc0oNj2eD=KZvXb|SJqr> zEg8p_h9oV>#^m>{bWO1uMe9(V#E})@5Mfy_U3*==UO-VCQ*Xwsb`{6eBcB0XXx^BR zVTKL8+-%k3E>4AO&0G})4)IBR|M`@M=TYMHJ?ug{N6HN#{m?v8^c|Z=isRsz3n^cb z>BU9L!mK7KQrH_7iTAwVWULbGn) zaBcocG+xSJ;7;jP{&v47w9a2Y!X1TsjtB2Xys4WoCT*^JJIvLAsffApA$!A>0(VvT z40916PFiHVxYU>e5WTrFrd~OpWG+)#FVmQbGD+sMDvJyX7Za~UCOndSUNZaH*}Hv8 zkO|>dh%6}6LnUF9@zoWw4Xrp`VOl(J$N+L-7=Omp-G=cDgC9ft5;KBl0BKQjf|rC% zV6zExZo;ISHs_gK`DE0R{=|d~#IYhDFIkx(VdCgFDa#hhNXJLU6~^TU{sBQ?s3dxj zO3_zbX4Wf9b*#2ht|SkytOi4HCKFbCl6XZ1pp|j2^dhsP-jL~oHRfJu!CXZTiCcw6VBw)J<*@Rb3Icxch zO`fC|z={QHnO_x()eN;06#ZOE*W+srA`2e(AuC5m!;BymcC1r!F~1()%{r35!~6IE5v7|rI9Rr@7zjfrbI3RF8|fe?Yje#8j` zFhMoyMy4=7THvwvPQvAYw`C7w^y@LiikKlbgUW7W?3BVL&0mFjQt9Djx)^2lqoisl z)wwV`fwoRYhz)tf?qKf%!bOCIx^MGHXc3G1*u9vgiwR5g$<2#urrCIuz!Ti=QhI3d zCVkbGh#J(_Z)pr87#WcG?@~ckr7BgGRc*EE+qSedR-!eknh~x(%Zr|XRc=)%zpt}J zpWL!S-KYPtrDf8bKyX~KFX04ciA)$w7MO}pnS|XOI0+J&ql>?9+21AHOE^g|3NlN7 zgw-bqza{Wov6qtfN}Mwv(2;9uS1dzY5b{RwR#