forked from widdowquinn/find_differential_primers
-
Notifications
You must be signed in to change notification settings - Fork 0
/
stitch_six_frame_stops.py
executable file
·200 lines (175 loc) · 6.93 KB
/
stitch_six_frame_stops.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
#!/usr/bin/env python
#
# stitch_six_frame_stops.py
#
# Takes an input (multiple) FASTA sequence file, and replaces all runs of
# N with the sequence NNNNNCATTCCATTCATTAATTAATTAATGAATGAATGNNNNN, which
# contains start and stop codons in all frames. All the sequences in the
# input file are then stitched together with the same sequence.
#
# Overall, the effect is to replace all regions of base uncertainty with
# the insert sequence, forcing stops and starts for gene-calling, to avoid
# chimeras or frame-shift errors due to placement of Ns.
#
# We also convert other ambiguity codons to Ns
#
# This script is intended for use in assembly pipelines, where contigs are
# provided in the correct (or, at least, an acceptable) order.
#
# Copyright (C) 2011 Leighton Pritchard
#
# Contact:
#
# Leighton Pritchard,
# Information and Computing Sciences,
# James Hutton Institute,
# Errol Road,
# Invergowrie,
# Dundee,
# DD6 9LH,
# Scotland,
# UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
###
# IMPORTS
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from optparse import OptionParser
import matplotlib
import logging
import logging.handlers
import os
import re
import sys
import time
separator = 'NNNNNCATTCCATTCATTAATTAATTAATGAATGAATGNNNNN'
###
# FUNCTIONS
# Parse command-line
def parse_cmdline(args):
""" Parse command-line arguments
"""
Usage = "Usage: %prog [options] <organism_type> <infile>"
parser = OptionParser(Usage)
parser.add_option("-o", "--outfile", dest="outfilename",
action="store", default=None,
help="Output filename")
parser.add_option("-i", "--infile", dest="infilename",
action="store", default=None,
help="Input filename")
parser.add_option("-n", "--nrun", dest="nrun",
action="store", default=2,
help="Minimum number of Ns in substituted run (shorter runs not substituted)")
parser.add_option("--noambiguity", dest="noambiguity",
action="store_true", default=False,
help="Do not replace IUPAC ambiguity codes other than N with N")
parser.add_option("--id", dest="seqid",
action="store", default=None,
help="ID/Accession for the output stitched sequence")
parser.add_option("--nodesc", dest="nodesc",
action="store_true", default=False,
help="Suppress sequence description string")
parser.add_option("-v", "--verbose", dest="verbose",
action="store_true", default=False,
help="Give verbose output")
return parser.parse_args()
# Replace runs of N in each sequence with the separator
def stitch_ns(sequences, nrun):
""" Loop over each input sequence in sequences, and replace any
occurrences of N longer than nrun with the separator sequence.
"""
nrun = int(nrun) # Ensure we're dealing with an int
new_sequences = []
for s in sequences:
seqdata = str(s.seq)
ncount = seqdata.count('n') + seqdata.count('N')
logger.info("%d Ns in %s" % (ncount, s.id))
if 0 == ncount:
logger.info("Keeping unmodified %s" % s.id)
new_sequences.append(s)
continue
new_seq, repcount = re.subn('[nN]{%d,}' % nrun , separator, seqdata)
logger.info("Replaced %d runs of N runs >= %d in %s" % (repcount, nrun, s.id))
new_seqrecord = SeqRecord(Seq(new_seq.upper()), id=s.id + "_N-replaced",
name=s.name + "_N-replaced",
description=s.description)
logger.info("New SeqRecord created:\n%s" % new_seqrecord)
new_sequences.append(new_seqrecord)
return new_sequences
# Stitch passed sequences together with the separator
def stitch_seqs(sequences, seqid, nodesc, noambiguity):
""" Stitch the passed sequences together, giving the result the passed ID,
but a description that derives from each of the passed sequences
"""
new_seq = separator.join([str(s.seq) for s in sequences])
if noambiguity:
new_seq, repcount = re.subn('[^acgtACGTnN]', 'N', new_seq)
logger.info("Substituted %d ambiguity bases" % repcount)
new_id = seqid
new_name = seqid+"_stitched"
if nodesc:
new_desc = ''
else:
new_desc = '+'.join([s.id for s in sequences])
stitched_seq = SeqRecord(Seq(new_seq), id=new_id, name=new_name,
description=new_desc)
logger.info("Created stitched sequence (len:%d):\n%s" % \
(len(stitched_seq), stitched_seq))
return stitched_seq
###
# SCRIPT
if __name__ == '__main__':
# Parse command-line
# options are options, arguments are the .sff files
options, args = parse_cmdline(sys.argv)
# We set up logging, and modify loglevel according to whether we need
# verbosity or not
logger = logging.getLogger('stitch_six_frame_stops.py')
logger.setLevel(logging.DEBUG)
err_handler = logging.StreamHandler(sys.stderr)
err_formatter = \
logging.Formatter('%(levelname)s: %(message)s')
err_handler.setFormatter(err_formatter)
if options.verbose:
err_handler.setLevel(logging.INFO)
else:
err_handler.setLevel(logging.WARNING)
logger.addHandler(err_handler)
# Report arguments, if verbose
logger.info(options)
logger.info(args)
# Load data from file (STDIN if no filename provided)
if options.infilename is None:
inhandle = sys.stdin
logger.info("Using STDIN for input")
else:
inhandle = open(options.infilename, 'rU')
data = list(SeqIO.parse(inhandle, 'fasta'))
inhandle.close()
# Stitch individual sequences
data = stitch_ns(data, options.nrun)
if options.seqid is None:
options.seqid = '_'.join(data[0].description.split(' ')[1:])
# Stitch all sequences together
stitchedseq = stitch_seqs(data, options.seqid, options.nodesc, options.noambiguity)
# Write the stitched sequence to file (or STDOUT if no filename provided)
if options.outfilename is None:
outhandle = sys.stdout
else:
outhandle = open(options.outfilename, 'w')
SeqIO.write([stitchedseq], outhandle, 'fasta')
outhandle.close()