#!/usr/bin/env python
# find_differential_primers.py
#
# A Python script that identifies pairs of forward and reverse primers which
# are capable of amplifying either individual organisms, or a particular
# family of organisms, from a set of genome sequences. Primers are expected
# to be located within CDS features, in an attempt to maximise sequence
# stability of the primers.
#
# The script reads from a configuration file containing sequence names and,
# at a minimum, the location of a complete genome sequence. Optionally, the
# configuration file may also indicate:
# - the location of a GenBank file containing CDS feature locations,
# or an equivalent output file from the Prodigal genefinder
# (http://compbio.ornl.gov/prodigal/)
# - the locations on the genome, and sequences of, primers predicted in
# EMBOSS ePrimer3 output format
# (http://emboss.bioinformatics.nl/cgi-bin/emboss/help/eprimer3)
#
# The first step of the script, if no primer file is specified, is to use
# the sequence file as the basis for a call to EMBOSS ePrimer3
# (http://emboss.bioinformatics.nl/cgi-bin/emboss/help/eprimer3), which must
# be installed and either on the $PATH, or its location specified at the
# command line. This will generate an output file with the same stem as the
# sequence file, but with the extension '.eprimer3'. Some ePrimer3 settings,
# such as the number of primers to find, are command-line options.
#
# If no CDS feature file is specified, and the --noCDS flag is not set,
# the script will attempt first to use Prodigal
# (http://compbio.ornl.gov/prodigal/) to predict CDS locations, placing the
# output in the same directory as the sequence source. If Prodigal cannot be
# found, a warning will be given, and the script will proceed as if the
# --noCDS flag is set. If this flag is set, then all primers are carried
# through to a query with the EMBOSS PrimerSearch package
# (http://emboss.bioinformatics.nl/cgi-bin/emboss/help/primersearch) against
# all other sequences in the dataset. If the flag is not set, then all
# primers that are not located within a CDS feature are excluded from the
# PrimerSearch input. To enable this, the PrimerSearch input is written to
# an intermediate file with the same stem as the input sequence, but the
# extension '.primers'.
#
# A run of PrimerSearch is carried out with every set of primers against
# all other sequences in the dataset. The output of this search is written to
# a file with the following naming convention:
# <query>_primers_vs_<target>.primersearch
# Where <query> is the name given to the query sequence in the config file, and
# <target> is the name given to the target sequence in the config file. This
# step is not carried out if the --noprimersearch flag is set. When this flag
# is set, the script will look for the corresponding PrimerSearch output in
# the same directory as the sequence file, and will report an error if it is
# not present.
#
# Finally, the script uses the PrimerSearch results to identify primers that
# are unique to each query sequence, and to each family named in the config
# file. These are reported in files with the following naming convention:
# <query>_specific_primers.eprimer3
# <family>_specific_primers.primers
# We use ePrimer3 format for the family-specific primers, even though the
# start and end positions are meaningless, as they will amplify at different
# sites in each family member. However, the source sequence is indicated in a
# comment line, and the primer sequences and T_m/GC% values should be the same,
# regardless.
# Primers that are universal to all sequences in the sample are written in
# ePrimer3 format to the file:
# universal_primers.eprimer3
# This file has the same caveats as the family-specific file above.
#
# Copyright (C) 2011 Leighton Pritchard
#
# Contact:
#
# Leighton Pritchard,
# Information and Computing Sciences,
# James Hutton Institute,
# Errol Road,
# Invergowrie,
# Dundee,
# DD6 9LH,
# Scotland,
# UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# TODO:
#
# 1) Make each of the callout routines create a list of command-line calls, so
# that these can be passed either to SGE or run via multiprocessing
# 2) Create a second parallelising callout function that runs SGE command-lines
# and make this safe for asynchronous passage to the next stage of callouts
# (maybe that should be 2a)
# 3) Make the input configuration file (optionally) XML
###
# IMPORTS
from Bio import SeqIO # Parsing biological sequence data
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML # BLAST XML parser
from Bio.Emboss.Applications import Primer3Commandline, PrimerSearchCommandline
from Bio.Emboss import Primer3, PrimerSearch # Primer3, PrimerSearch parsers
from Bio.GenBank import _FeatureConsumer # For parsing GenBank locations
from Bio.Seq import Seq # Represents a sequence
from Bio.SeqRecord import SeqRecord # Represents an annotated record
from Bio.SeqFeature import SeqFeature # Represents a sequence feature
from bx.intervals.cluster import ClusterTree # Interval tree building
from collections import defaultdict # Syntactic sugar
from optparse import OptionParser # Cmd-line parsing
import multiprocessing
import os
import subprocess
import sys
import time
###
# CLASSES
# Class describing an organism's genome, and associated data.
class GenomeData:
""" Describes an organism's genome, and has attributes:
name - short, unique (not enforced) identification string
family - string indicating a family membership
seqfilename - location of representative genome sequence file
ftfilename - location of GBK/Prodigal feature file
primerfilename - location of ePrimer3 format primers file
primers - dictionary collection of Bio.Emboss.Primer3.Primer
objects, keyed by primer name
        primersearchfilename - location of PrimerSearch-format primers file
        sequence - Bio.SeqRecord of the loaded genome sequence
        Exposed methods are:
        load_sequence, write_primers, get_unique_primers,
        get_family_unique_primers, get_primers_amplify_count
    """
def __init__(self, name, family=None, seqfilename=None, ftfilename=None,
primerfilename=None, primersearchfilename=None):
""" Expects at minimum a name to identify the organism. Optionally
filenames describing the location of sequence, feature, and
primer data may be specified, along with a family classification.
name - short, unique (not enforced) identification string
family - string indicating a family membership
seqfilename - location of representative genome sequence file
ftfilename - location of GBK/Prodigal feature file
primerfilename - location of ePrimer3 format primers file
primersearchfilename - location of PrimerSearch format primers file
Rather hackily, passing '-' to any of the keyword arguments also
sets them to None; this is to aid in config file parsing, and
is a wee bit ugly.
"""
self.name = name # Short identifier
self.family = family if family != '-' else None
self.seqfilename = seqfilename if seqfilename != '-' else None
self.ftfilename = ftfilename if ftfilename != '-' else None
self.primerfilename = primerfilename if primerfilename != '-' \
else None
self.primersearchfilename = primersearchfilename if\
primersearchfilename != '-' else None
self.primers = {} # Dict of Primer objects, keyed by name
self.load_sequence()
def load_sequence(self):
""" Load the sequence defined in self.seqfile into memory. We
assume it's FASTA format. This can then be used to calculate
amplicons when loading primers in.
"""
if self.seqfilename is not None:
try:
self.sequence = SeqIO.read(open(self.seqfilename, 'rU'), 'fasta')
except ValueError, data:
print "Loading sequence file %s failed" % self.seqfilename
print "Error message: ", data
sys.exit(1)
def write_primers(self, verbose=False):
""" Write the primer pairs in self.primers out to file in an
appropriate format for PrimerSearch. If the filename is not
already defined, the filestem of the
source sequencefile is used for the output file, with the
extension '.primers'.
"""
# Define output filename, if not already defined
if self.primersearchfilename is None:
self.primersearchfilename = os.path.splitext(self.seqfilename)[0]+\
'.primers'
if verbose:
t0 = time.time()
print "Writing primers to file %s ..." % self.primersearchfilename
# Open handle and write data
outfh = open(self.primersearchfilename, 'w')
print >> outfh, "# Primers for %s" % self.name
print >> outfh, "# Automatically generated by find_differential_primers"
for primers in self.primers.values():
print >> outfh, "%s\t%s\t%s" %\
(primers.name, primers.forward_seq,
primers.reverse_seq)
if not len(self.primers):
print >> sys.stderr, "WARNING: no primers written to %s!" % \
self.primersearchfilename
# Being tidy
outfh.close()
if verbose:
print "... wrote %d primers to %s (%.3fs)" %\
(len(self.primers), self.primersearchfilename, time.time() - t0)
def get_unique_primers(self, cds_overlap=False,
gc3primevalid=False, oligovalid=False,
blastfilter=False, single_product=False,
verbose=False):
""" Returns a list of primers that have the .amplifies_organism
attribute, but where this is an empty set.
If cds_overlap is True, then this list is restricted to those
primers whose .cds_overlap attribute is also True
"""
return self.get_primers_amplify_count(0, cds_overlap, gc3primevalid,
oligovalid, blastfilter, single_product,
verbose)
def get_family_unique_primers(self, family_members, cds_overlap=False,
gc3primevalid=False, oligovalid=False,
blastfilter=False, single_product=False,
verbose=False):
""" Returns a list of primers that have the .amplifies_organism
attribute, and where the set of organisms passed in family_members
            is the same as that in .amplifies_organism, with the addition of
self.name.
If cds_overlap is True, then this list is restricted to those
primers whose .cds_overlap attribute is also True
"""
primerlist = []
for p in self.primers.values():
if family_members == set([self.name]).union(p.amplifies_organism):
primerlist.append(p)
if verbose:
print "[%s] %d family primers" % (self.name,
len(primerlist))
if cds_overlap:
primerlist = [p for p in primerlist if p.cds_overlap]
if verbose:
print "[%s] %d primers after CDS filter" % \
(self.name, len(primerlist))
if gc3primevalid:
primerlist = [p for p in primerlist if p.gc3primevalid]
if verbose:
print "[%s] %d primers after GC 3` filter" % \
(self.name, len(primerlist))
if oligovalid:
primerlist = [p for p in primerlist if p.oligovalid]
if verbose:
print "[%s] %d primers after oligo filter" % \
(self.name, len(primerlist))
if blastfilter:
primerlist = [p for p in primerlist if p.blastpass]
if verbose:
print "[%s] %d primers after BLAST filter" % \
(self.name, len(primerlist))
if single_product:
primerlist = [p for p in primerlist if p.negative_control_amplimers == 1]
if verbose:
print "[%s] %d primers after single_product filter" % \
(self.name, len(primerlist))
if verbose:
print "[%s] returning %d primers" % (self.name, len(primerlist))
return primerlist
def get_primers_amplify_count(self, count, cds_overlap=False,
gc3primevalid=False, oligovalid=False,
blastfilter=False, single_product=False,
verbose=False):
""" Returns a list of primers that have the .amplifies_organism
attribute and the length of this set is equal to the passed count.
If cds_overlap is True, then this list is restricted to those
primers whose .cds_overlap attribute is also True
"""
primerlist = [p for p in self.primers.values() if \
count == len(p.amplifies_organism)]
if verbose:
print "[%s] %d family primers that amplify %d orgs" % \
(self.name, len(primerlist), count)
if cds_overlap:
primerlist = [p for p in primerlist if p.cds_overlap]
if verbose:
print "[%s] %d primers after CDS filter" % \
(self.name, len(primerlist))
if gc3primevalid:
primerlist = [p for p in primerlist if p.gc3primevalid]
if verbose:
print "[%s] %d primers after GC 3` filter" % \
(self.name, len(primerlist))
if oligovalid:
primerlist = [p for p in primerlist if p.oligovalid]
if verbose:
print "[%s] %d primers after oligo filter" % \
(self.name, len(primerlist))
if blastfilter:
primerlist = [p for p in primerlist if p.blastpass]
if verbose:
print "[%s] %d primers after BLAST filter" % \
(self.name, len(primerlist))
if single_product:
primerlist = [p for p in primerlist if p.negative_control_amplimers == 1]
if verbose:
print "[%s] %d primers after single_product filter" % \
(self.name, len(primerlist))
if verbose:
print "[%s] returning %d primers" % (self.name, len(primerlist))
return primerlist
def __str__(self):
""" Pretty string description of object contents
"""
outstr = ['GenomeData object: %s' % self.name]
outstr.append('Family: %s' % self.family)
outstr.append('Sequence file: %s' % self.seqfilename)
outstr.append('Feature file: %s' % self.ftfilename)
outstr.append('Primers file: %s' % self.primerfilename)
outstr.append('PrimerSearch file: %s' % self.primersearchfilename)
outstr.append('Primers: %d' % len(self.primers))
if len(self.primers):
outstr.append('Primers overlapping CDS: %d' %\
len([p for p in self.primers.values()\
if p.cds_overlap]))
return os.linesep.join(outstr) + os.linesep
###
# FUNCTIONS
# Parse command-line options
def parse_cmdline(args):
""" Parse command line, accepting args obtained from sys.argv
"""
usage = "usage: %prog [options] arg"
parser = OptionParser(usage)
parser.add_option("-i", "--infile", dest="filename", action="store",
help="location of configuration file",
default=None)
parser.add_option("--nocds", dest="nocds", action="store_true",
help="do not restrict primer prediction to CDS",
default=False)
parser.add_option("--noprodigal", dest="noprodigal", action="store_true",
help="do not carry out Prodigal prediction step",
default=False)
parser.add_option("--noprimer3", dest="noprimer3", action="store_true",
help="do not carry out ePrimer3 prediction step",
default=False)
parser.add_option("--noprimersearch", dest="noprimersearch",
action="store_true",
help="do not carry out PrimerSearch step",
default=False)
parser.add_option("--noclassify", dest="noclassify",
action="store_true",
help="do not carry out primer classification step",
default=False)
parser.add_option("--single_product", dest="single_product",
action="store",
help="location of FASTA sequence file containing sequences from which " +\
"a sequence-specific primer must amplify exactly one product.",
default=None)
parser.add_option("--filtergc3prime", dest="filtergc3prime",
action="store_true",
help="allow no more than two GC at the 3` end of primers",
default=False)
parser.add_option("--prodigal", dest="prodigal_exe", action="store",
help="location of Prodigal executable",
default="prodigal")
parser.add_option("--eprimer3", dest="eprimer3_exe", action="store",
help="location of EMBOSS eprimer3 executable",
default="eprimer3")
parser.add_option("--numreturn", dest="numreturn", action="store",
help="number of primers to find",
default=20, type="int")
parser.add_option("--osize", dest="osize", action="store",
help="optimal size for primer oligo",
default=20, type="int")
parser.add_option("--minsize", dest="minsize", action="store",
help="minimum size for primer oligo",
default=18, type="int")
parser.add_option("--maxsize", dest="maxsize", action="store",
help="maximum size for primer oligo",
default=22, type="int")
parser.add_option("--otm", dest="otm", action="store",
help="optimal melting temperature for primer oligo",
default=59, type="int")
parser.add_option("--mintm", dest="mintm", action="store",
help="minimum melting temperature for primer oligo",
default=58, type="int")
parser.add_option("--maxtm", dest="maxtm", action="store",
help="maximum melting temperature for primer oligo",
default=60, type="int")
parser.add_option("--ogcpercent", dest="ogcpercent", action="store",
help="optimal %GC for primer oligo",
default=55, type="int")
parser.add_option("--mingc", dest="mingc", action="store",
help="minimum %GC for primer oligo",
default=30, type="int")
parser.add_option("--maxgc", dest="maxgc", action="store",
help="maximum %GC for primer oligo",
default=80, type="int")
parser.add_option("--psizeopt", dest="psizeopt", action="store",
help="optimal size for amplified region",
default=100, type="int")
parser.add_option("--psizemin", dest="psizemin", action="store",
help="minimum size for amplified region",
default=50, type="int")
parser.add_option("--psizemax", dest="psizemax", action="store",
help="maximum size for amplified region",
default=150, type="int")
parser.add_option("--maxpolyx", dest="maxpolyx", action="store",
help="maximum run of repeated nucleotides in primer",
default=3, type="int")
parser.add_option("--mismatchpercent", dest="mismatchpercent",
action="store",
help="allowed percentage mismatch in primersearch",
default=10, type="int")
parser.add_option("--hybridprobe", dest="hybridprobe", action="store_true",
help="generate internal oligo as a hybridisation probe",
default=False)
parser.add_option("--oligoosize", dest="oligoosize", action="store",
help="optimal size for internal oligo",
default=20, type="int")
parser.add_option("--oligominsize", dest="oligominsize", action="store",
help="minimum size for internal oligo",
default=13, type="int")
parser.add_option("--oligomaxsize", dest="oligomaxsize", action="store",
help="maximum size for internal oligo",
default=30, type="int")
parser.add_option("--oligootm", dest="oligootm", action="store",
help="optimal melting temperature for internal oligo",
default=69, type="int")
parser.add_option("--oligomintm", dest="oligomintm", action="store",
help="minimum melting temperature for internal oligo",
default=68, type="int")
parser.add_option("--oligomaxtm", dest="oligomaxtm", action="store",
help="maximum melting temperature for internal oligo",
default=70, type="int")
parser.add_option("--oligoogcpercent", dest="oligoogcpercent",
action="store",
help="optimal %GC for internal oligo",
default=55, type="int")
parser.add_option("--oligomingc", dest="oligomingc", action="store",
help="minimum %GC for internal oligo",
default=30, type="int")
parser.add_option("--oligomaxgc", dest="oligomaxgc", action="store",
help="maximum %GC for internal oligo",
default=80, type="int")
parser.add_option("--oligomaxpolyx", dest="oligomaxpolyx", action="store",
help="maximum run of repeated nt in internal oligo",
default=3, type="int")
parser.add_option("--cpus", dest="cpus", action="store",
help="number of CPUs to use in multiprocessing",
default=multiprocessing.cpu_count(), type="int")
parser.add_option("--sge", dest="sge", action="store_true",
help="use SGE job scheduler",
default=False)
parser.add_option("-o", "--outdir", dest="outdir", action="store",
help="directory for output files",
default="differential_primer_results")
parser.add_option("-v", "--verbose", action="store_true", dest="verbose",
help="report progress to stdout",
default=False)
parser.add_option("--clean", action="store_true", dest="clean",
help="clean up old output files before running",
default=False)
parser.add_option("--cleanonly", action="store_true", dest="cleanonly",
help="clean up old output files and exit",
default=False)
parser.add_option("--blast_exe", dest="blast_exe", action="store",
help="location of BLASTN/BLASTALL executable",
default="blastn")
parser.add_option("--blastdb", dest="blastdb", action="store",
help="location of BLAST database",
default=None)
parser.add_option("--useblast", dest="useblast", action="store_true",
help="use existing BLAST results",
default=False)
(options, args) = parser.parse_args()
return (options, args)
# Create a list of GenomeData objects corresponding to config file entries
def create_gd_from_config(filename, verbose):
""" Parses data from a configuration file into a list of GenomeData
objects.
Returns a list of GenomeData objects.
Each line of the config file describes a single genome.
The config file format is six tab-separated columns, where columns
may be separated by multiple tabs. 'Empty' data values are indicated
by the '-' symbol, and these are converted into None objects in
parsing.
Comment lines start with '#', as in Python.
        The six columns are:
1) Genome name
2) Genome family
3) Location of FASTA format sequence data
4) Location of GENBANK/PRODIGAL format feature data
5) Location of EPRIMER3 format primer data
6) Location of PRIMERSEARCH input format primer data
The data would, of course, be better presented as an XML file, but it
might be useful to maintain both tab- and XML-formatted approaches to
facilitate human construction as well as computational.
"""
if verbose:
t0 = time.time()
print "Creating list of genomes from config file %s ..." % filename
gdlist = [] # Hold GenomeData objects
# Ignore blank lines and comments...
for line in [l.strip() for l in open(filename, 'rU')\
if l.strip() and not l.startswith('#')]:
# Split data and create new GenomeData object, adding it to the list
data = [e.strip() for e in line.strip().split('\t') if e.strip()]
name, family, sfile, ffile, pfile, psfile = tuple(data)
gdlist.append(GenomeData(name, family, sfile, ffile, pfile, psfile))
if verbose:
print "... created GenomeData object for %s ..." % name
print gdlist[-1]
if verbose:
print "... created %d GenomeData objects (%.3fs)" % (len(gdlist),
time.time() - t0)
return gdlist
# Check whether each GenomeData object has multiple sequences and, if so,
# concatenate them sensibly, resetting feature and primer file locations to
# None
def check_single_sequence(gdlist, verbose):
""" Loops over the GenomeData objects in the passed list and, where the
sequence file contains multiple sequences, concatenates them into
a single sequence using a spacer that facilitates gene-finding. As
this process changes feature and primer locations, the ftfilename and
primerfilename attributes are reset to None, and these are
recalculated later on in the script, where necessary.
"""
if verbose:
t0 = time.time()
print "Checking for multiple sequences ..."
for gd in gdlist:
# Verify that the sequence file contains a single sequence
seqdata = [s for s in SeqIO.parse(open(gd.seqfilename, 'rU'), 'fasta')]
if len(seqdata) != 1:
if verbose:
print "... %s describes multiple sequences ..." %\
gd.seqfilename
gd.seqfilename = concatenate_sequences(gd, verbose) # Concatenate
if verbose:
print "... clearing feature and primer file locations ..."
gd.ftfilename, gd.primerfilename, gd.primersearchfilename = \
None, None, None
if verbose:
print "... checked %d GenomeData objects (%.3fs)" % (len(gdlist),
time.time()-t0)
# return gdlist
# Concatenate multiple fragments of a genome to a single file
def concatenate_sequences(gd, verbose=False):
""" Takes a GenomeData object and concatenates sequences with the spacer
sequence NNNNNCATTCCATTCATTAATTAATTAATGAATGAATGNNNNN (this contains
start and stop codons in all frames, to cap individual sequences).
We write this data out to a new file
For filename convention, we just add '_concatenated' to the end
of the sequence filestem, and use the '.fas' extension.
"""
# Spacer contains start and stop codons in all six frames
spacer = 'NNNNNCATTCCATTCATTAATTAATTAATGAATGAATGNNNNN'
if verbose:
t0 = time.time()
print "Concatenating sequences from %s ..." % gd.seqfilename
newseq = SeqRecord(Seq(spacer.join([s.seq.data for s in \
SeqIO.parse(open(gd.seqfilename, 'rU'),
'fasta')])),
id=gd.name+"_concatenated",
description="%s, concatenated with spacers" %\
gd.name)
outfilename = os.path.splitext(gd.seqfilename)[0] + '_concatenated' +\
'.fas'
SeqIO.write([newseq], open(outfilename, 'w'), 'fasta')
if verbose:
print "... wrote concatenated data to %s (%.3fs)" % (outfilename,
time.time() - t0)
return outfilename
# Check for each GenomeData object in a passed list, the existence of
# the feature file, and create one using Prodigal if it doesn't exist already
def check_ftfilenames(gdlist, prodigal_exe, poolsize, sge, verbose):
""" Loop over the GenomeData objects in gdlist and, where no feature file
is specified, add the GenomeData object to the list of
packets to be processed in parallel by Prodigal using multiprocessing.
"""
if verbose:
t0 = time.time()
print "Checking and predicting features for GenomeData files ..."
# We split the GenomeData objects into those with, and without,
# defined feature files, but we don't test the validity of the files
# that were predefined, here.
gds_with_ft = [gd for gd in gdlist if
(gd.ftfilename is not None and \
os.path.isfile(gd.ftfilename))]
gds_no_ft = [gd for gd in gdlist if \
(gd.ftfilename is None or \
not os.path.isfile(gd.ftfilename))]
# Predict features for those GenomeData objects with no feature file
if verbose:
print "... %d GenomeData objects have no feature file ..." %\
len(gds_no_ft)
print "... running %d Prodigal jobs to predict CDS ..." %\
len(gds_no_ft)
# Create a list of command-line tuples, for Prodigal
# gene prediction applied to each GenomeData object in gds_no_ft.
clines = []
for gd in gds_no_ft:
gd.ftfilename = os.path.splitext(gd.seqfilename)[0] + '.prodigalout'
seqfilename = os.path.splitext(gd.seqfilename)[0] + '.features'
cline = "%s -a %s < %s > %s" % (prodigal_exe, seqfilename,
gd.seqfilename, gd.ftfilename)
clines.append(cline)
if verbose:
print "... Prodigal jobs to run:"
for cline in clines:
print cline
# Depending on the type of parallelisation required, these command-lines
# are either run locally via multiprocessing, or passed out to SGE
if not sge:
multiprocessing_run(clines, poolsize, verbose)
else:
sge_run(clines, verbose)
# Check whether GenomeData objects have a valid primer definition file
def check_primers(gdlist, verbose):
""" Loop over GenomeData objects in the passed gdlist and, if they have
a defined primerfilename attribute, attempt to parse it. If this
is successful, do nothing. If it fails, set the primerfilename
attribute to None.
"""
if verbose:
t0 = time.time()
print "Checking ePrimer3 output files ..."
for gd in [g for g in gdlist if g.primerfilename]:
try:
Primer3.read(open(gd.primerfilename, 'rU'))
if verbose:
print "... %s primer file %s OK ..." % (gd.name,
gd.primerfilename)
except:
if verbose:
print "... %s primer file %s not OK ..." % (gd.name,
gd.primerfilename)
gd.primerfilename = None
# Check for each GenomeData object in a passed list, the existence of
# the ePrimer3 file, and create one using ePrimer3 if it doesn't exist already
def predict_primers(gdlist, eprimer3_exe, poolsize, numreturn,
osize, minsize, maxsize,
otm, mintm, maxtm,
ogcpercent, mingc, maxgc,
psizeopt, psizemin, psizemax,
maxpolyx,
hybridprobe, oligoosize, oligominsize, oligomaxsize,
oligootm, oligomintm, oligomaxtm, oligoogcpercent,
oligomingc, oligomaxgc, oligomaxpolyx,
sge, verbose):
""" Loop over the GenomeData objects in gdlist and, where no primer file
is specified, add the GenomeData object to the list of
        packets to be processed in parallel by ePrimer3 using multiprocessing.
"""
if verbose:
t0 = time.time()
print "Checking and predicting primers for GenomeData files ..."
input_count = len(gdlist) # For sanity later
# We need to split the GenomeData objects into those with, and without,
# defined primer files, but we don't test the validity of these files
gds_with_primers = [gd for gd in gdlist if gd.primerfilename is not None]
gds_no_primers = [gd for gd in gdlist if gd.primerfilename is None]
# Predict primers for those GenomeData objects with no primer file
if verbose:
print "... %d GenomeData objects have no primer file ..." %\
len(gds_no_primers)
print "... running %d ePrimer3 jobs to predict CDS ..." %\
len(gds_no_primers)
# Create command-lines to run ePrimer3
clines = []
for gd in gds_no_primers:
# Create ePrimer3 command-line.
cline = Primer3Commandline()
cline.sequence = gd.seqfilename
cline.auto = True
cline.osize = "%d" % osize # Optimal primer size
cline.minsize = "%d" % minsize # Min primer size
cline.maxsize = "%d" % maxsize # Max primer size
cline.otm = "%d" % otm # Optimal primer Tm
cline.mintm = "%d" % mintm # Min primer Tm
cline.maxtm = "%d" % maxtm # Max primer Tm
cline.ogcpercent = "%d" % ogcpercent # Optimal primer %GC
cline.mingc = "%d" % mingc # Min primer %GC
cline.maxgc = "%d" % maxgc # Max primer %GC
cline.psizeopt = "%d" % psizeopt # Optimal product size
cline.maxpolyx = "%d" % maxpolyx # Longest polyX run in primer
cline.prange = "%d-%d" % (psizemin, psizemax) # Allowed product sizes
cline.numreturn = "%d" % numreturn # Number of primers to predict
cline.hybridprobe = hybridprobe # Predict internal oligo?
cline.osizeopt = "%d" % oligoosize # Internal oligo parameters;
cline.ominsize = "%d" % oligominsize # We use EMBOSS v6 parameter
cline.omaxsize = "%d" % oligomaxsize # names, here.
cline.otmopt = "%d" % oligootm
cline.otmmin = "%d" % oligomintm
cline.otmmax = "%d" % oligomaxtm
cline.ogcopt = "%d" % oligoogcpercent
cline.ogcmin = "%d" % oligomingc
cline.ogcmax = "%d" % oligomaxgc
cline.opolyxmax = "%d" % oligomaxpolyx
cline.outfile = os.path.splitext(gd.seqfilename)[0] + '.eprimer3'
gd.primerfilename = cline.outfile
clines.append(cline)
if verbose:
print "... ePrimer3 jobs to run:"
for cline in clines:
print cline
# Parallelise jobs
if not sge:
multiprocessing_run(clines, poolsize, verbose)
else:
sge_run(clines, verbose)
# Load primers from ePrimer3 files into each GenomeData object
def load_primers(gdlist, minlength, nocds=True, filtergc3prime=False,
hybridprobe=False, verbose=False):
""" Load primer data from an ePrimer3 output file into a dictionary of
Bio.Emboss.Primer3.Primer objects (keyed by primer name) in a
GenomeData object, for each such object in the passed list.
Each primer object is given a new ad hoc attribute 'cds_overlap' which
takes a Boolean, indicating whether the primer is found wholly within
a CDS defined in the GenomeData object's feature file; this status
is determined using an interval tree approach.
"""
if verbose:
t0 = time.time()
print "Loading primers, %sfiltering on CDS overlap" %\
('not ' if nocds else '')
# Load in the primers, assigning False to a new, ad hoc attribute called
# cds_overlap in each
for gd in gdlist:
if verbose:
print "... loading primers into %s from %s ..." %\
(gd.name, gd.primerfilename)
        # Check that a primer file is defined and actually exists before parsing
        if gd.primerfilename is None or not os.path.isfile(gd.primerfilename):
            raise IOError("Primer file %s does not exist." % gd.primerfilename)
primers = Primer3.read(open(gd.primerfilename, 'rU')).primers
# Add primer pairs to the gd.primers dictionary
primercount = 0
for primer in primers:
primercount += 1
primer.cds_overlap = False # default state
primer.name = "%s_primer_%04d" % (gd.name, primercount)
primer.amplifies_organism = set() # Organisms amplified
primer.amplifies_family = set() # Organism families amplified
primer.gc3primevalid = True # Passes GC 3` test
primer.oligovalid = True # Oligo passes filter
primer.blastpass = True # Primers pass BLAST screen
gd.primers.setdefault(primer.name, primer)
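            # ePrimer3 reports 1-based positions; slice the (0-based) genome
            # sequence from the forward primer start through the end of the
            # reverse primer site to record the predicted amplicon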
primer.amplicon = \
gd.sequence[primer.forward_start-1:\
primer.reverse_start-1+primer.reverse_length]
primer.amplicon.description = primer.name
if verbose:
print "... loaded %d primers into %s ..." %\
(len(gd.primers), gd.name)
# Now that the primers are in the GenomeData object, we can filter
# them on location, if necessary
if not nocds:
filter_primers(gd, minlength, verbose)
# We also filter primers on the basis of GC presence at the 3` end
if filtergc3prime:
filter_primers_gc_3prime(gd, verbose)
# Filter primers on the basis of internal oligo characteristics
if hybridprobe:
filter_primers_oligo(gd, verbose)
# Filter primers in a passed gd object on the basis of CDS features
def filter_primers(gd, minlength, verbose):
""" Takes a passed GenomeData object, and the minimum size of an amplified
region, and then uses a ClusterTree to find clusters of CDS and
primer regions that overlap by this minimum size.
There is a possibility that, by stacking primer regions, some of
the reported overlapping primers may in fact not overlap CDS regions
directly, so this function may overreport primers.
"""
# Load in the feature data. This is done using either SeqIO for
# files with the .gbk extension, or an ad hoc parser for
# .prodigalout prediction files
if verbose:
t0 = time.time()
print "Loading feature data from %s ..." % gd.ftfilename
if os.path.splitext(gd.ftfilename)[-1] == '.gbk': # GenBank
seqrecord = [r for r in SeqIO.parse(open(gd.ftfilename, 'rU'),
'genbank')]
elif os.path.splitext(gd.ftfilename)[-1] == '.prodigalout':
seqrecord = parse_prodigal_features(gd.ftfilename)
else:
raise IOError, "Expected .gbk or .prodigalout file extension"
if verbose:
print "... loaded %d features ..." % len(seqrecord.features)
# Use a ClusterTree as an interval tree to identify those
# primers that overlap with features. By setting the minimum overlap to
# the minimum size for a primer region, we ensure that we capture every
# primer that overlaps a CDS feature by this amount, but we may also
# extend beyond the CDS by stacking primers, in principle.
if verbose:
print "... adding CDS feature locations to ClusterTree ..."
ct = ClusterTree(-minlength, 2)
# Loop over CDS features and add them to the tree with ID '-1'. This
# allows us to easily separate the features from primers when reviewing
# clusters.
for ft in [f for f in seqrecord.features if f.type == 'CDS']:
ct.insert(ft.location.nofuzzy_start, ft.location.nofuzzy_end, -1)
# ClusterTree requires us to identify elements on the tree by integers,
# so we have to relate each primer added to an integer in a temporary
# list of the gd.primers values
if verbose:
print "... adding primer locations to cluster tree ..."
aux = gd.primers.values()
for i in range(len(gd.primers)):
ct.insert(aux[i].forward_start,
aux[i].reverse_start + aux[i].reverse_length,
i)
# Now we find the overlapping regions, extracting all element ids that are
# not -1. These are the indices for aux, and we modify the gd.cds_overlap
# attribute directly
if verbose:
print "... finding overlapping primers ..."
overlap_primer_ids = set() # CDS overlap primers
for (s, e, ids) in ct.getregions():
primer_ids = set([i for i in ids if i != -1]) # get non-feature ids
overlap_primer_ids = overlap_primer_ids.union(primer_ids)
if verbose:
print "... %d primers overlap CDS features (%.3fs)" %\
(len(overlap_primer_ids), time.time() - t0)
for i in overlap_primer_ids:
aux[i].cds_overlap = True
# Filter primers on the basis of GC content at 3` end
def filter_primers_gc_3prime(gd, verbose):
""" Loops over the primer pairs in the passed GenomeData object and,
if either primer has more than 2 G+C in the last five nucleotides,
sets the .gc3primevalid flag to False.
"""
if verbose:
t0 = time.time()
print "Filtering %s primers on 3` GC content ..." % gd.name
invalidcount = 0
for primer in gd.primers.values():
fseq, rseq = primer.forward_seq[-5:], primer.reverse_seq[-5:]
        if (fseq.count('C') + fseq.count('G') > 2) or \
                (rseq.count('C') + rseq.count('G') > 2):
primer.gc3primevalid = False
invalidcount += 1
if verbose:
print "... %d primers failed (%.3fs)" % (invalidcount, time.time()-t0)
# Filter primers on the basis of internal oligo characteristics
def filter_primers_oligo(gd, verbose):
""" Loops over the primer pairs in the passed GenomeData object and,
mark the primer.oligovalid as False if the internal oligo corresponds
to any of the following criteria:
- G at 5` end or 3` end
- two or more counts of 'CC'
- G in second position at 5` end
"""
if verbose:
t0 = time.time()
print "Filtering %s primers on internal oligo characteristics ..." %\
gd.name
invalidcount = 0
for primer in gd.primers.values():
if (primer.oligo.seq.startswith('G') or \
primer.oligo.seq.endswith('G') or \
primer.oligo.seq[1:-1].count('CC') > 1 or \
primer.oligo.seq[1] == 'G'):
primer.oligovalid = False
invalidcount += 1
if verbose:
print "... %d primers failed (%.3fs)" % (invalidcount, time.time()-t0)
# Screen passed GenomeData primers against BLAST database
def blast_screen(gdlist, blast_exe, blastdb, cpus, sge, verbose):
""" The BLAST screen takes three stages. Firstly we construct a FASTA
sequence file containing all primer forward and reverse sequences,
for all primers in each GenomeData object of the list.
We then use the local BLAST+ (not legacy BLAST) interface to BLASTN to
query the named database with the input file. The multiprocessing
of BLASTN is handled by either our multiprocessing threading approach,
or by SGE; we don't use the built-in threading of BLAST so that we
retain flexibility when moving to SGE. It's a small modification to
revert to using the BLAST multithreading code. The output file is
named according to the GenomeData object.
The final step is to parse the BLAST output, and label the primers
that make hits as not having passed the BLAST filter.
"""
build_blast_input(gdlist, verbose)
run_blast(gdlist, blast_exe, blastdb, cpus, sge, verbose)
parse_blast(gdlist, cpus, verbose)
# Write BLAST input files for each GenomeData object
def build_blast_input(gdlist, verbose):
""" Loops over each GenomeData object in the list, and writes forward
and reverse primer sequences out in FASTA format to a file with
filename derived from the GenomeData object name.
"""
if verbose:
t0 = time.time()
print "Writing files for BLAST input ..."
for gd in gdlist:
gd.blastinfilename = os.path.join(os.path.split(gd.seqfilename)[0],
"%s_BLAST_input.fas" % gd.name)
seqrecords = []
for name, primer in gd.primers.items():
seqrecords.append(SeqRecord(Seq(primer.forward_seq),
id = name + '_forward'))
seqrecords.append(SeqRecord(Seq(primer.reverse_seq),
id = name + '_reverse'))
if verbose:
print "... writing %s ..." % gd.blastinfilename
SeqIO.write(seqrecords,
open(gd.blastinfilename, 'w'),
'fasta')
if verbose:
print "... done (%.3fs)" % (time.time() - t0)
# Run BLAST screen for each GenomeData object
def run_blast(gdlist, blast_exe, blastdb, poolsize, sge, verbose):
""" Loop over the GenomeData objects in the passed list, and run a
suitable BLASTN query with the primer sequences, writing to a file
with name derived from the GenomeData object, in XML format.
"""
if verbose:
t0 = time.time()
print "Compiling BLASTN command-lines ..."
clines = []
for gd in gdlist:
gd.blastoutfilename = os.path.join(os.path.split(gd.seqfilename)[0],
"%s_BLAST_output.xml" % gd.name)
clines.append(NcbiblastnCommandline(query=gd.blastinfilename,
db=blastdb,
task='blastn', # default: MEGABLAST
out=gd.blastoutfilename,
num_alignments=1,
num_descriptions=1,
outfmt=5,
perc_identity=90,
ungapped=True))
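    # Each NcbiblastnCommandline renders to a BLAST+ call along the lines of
    # (hypothetical filenames):
    #   blastn -query Pba_BLAST_input.fas -db <blastdb> -task blastn \
    #          -out Pba_BLAST_output.xml -num_alignments 1 -num_descriptions 1 \
    #          -outfmt 5 -perc_identity 90 -ungapped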
if verbose:
print "... BLASTN+ jobs to run:"
for cline in clines:
print cline
if not sge:
multiprocessing_run(clines, poolsize, verbose)
else:
sge_run(clines, verbose)
# Parse BLAST output for each GenomeData object
def parse_blast(gdlist, poolsize, verbose):
""" Loop over the GenomeData objects in the passed list, and parse the
BLAST XML output indicated in the .blastoutfilename attribute.
For each query that makes a suitable match, mark the appropriate
primer's .blastpass attribute as False
"""
if verbose:
t0 = time.time()
print "Parsing BLASTN output with multiprocessing ..."
# Here I'm cheating a bit and using multiprocessing directly so that
# we can speed up the parsing process a bit
pool = multiprocessing.Pool(processes=poolsize)
pool_results = [pool.apply_async(process_blastxml,
(gd.blastoutfilename, gd.name, verbose)) \
for gd in gdlist]
pool.close()
pool.join()
# Process the results returned from the BLAST searches. Create a