-
Notifications
You must be signed in to change notification settings - Fork 34
/
Copy pathgff3_postprocess.py
executable file
·116 lines (95 loc) · 3.74 KB
/
gff3_postprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/env python3
'''
GFF3 postprocessing
- Remove UTRs when two near genes are overlapped
Input: GFF3 file
Output: Postprocessed GFF3 file
Last updated: May 18, 2021
'''
import os
from argparse import ArgumentParser
from BCBio import GFF
from Bio import SeqIO
from Bio.SeqFeature import FeatureLocation
def main():
'''Main function'''
argparser_usage = (
'gff3_postprocess.py -g <genome_assembly> -i <input_gff3> -o '
'<output_gff3>')
parser = ArgumentParser(usage=argparser_usage)
parser.add_argument(
'-g', '--genome_assembly', nargs=1, required=True,
help='Genome assembly file in FASTA format')
parser.add_argument(
'-i', '--input_gff3', nargs=1, required=True,
help='Input GFF3 file')
parser.add_argument(
'-o', '--output_gff3', nargs=1, required=True,
help='Output GFF3 file name')
args = parser.parse_args()
genome_assembly = os.path.abspath(args.genome_assembly[0])
input_gff3 = os.path.abspath(args.input_gff3[0])
output_gff3 = os.path.abspath(args.output_gff3[0])
# Create necessary dirs
gff3_postprocess(genome_assembly, input_gff3, output_gff3)
def import_file(input_file):
'''Import file'''
with open(input_file) as f_in:
txt = list(line.rstrip() for line in f_in)
return txt
def gff3_postprocess(genome_assembly, input_gff3, output_gff3):
'''GFF3 post-processing'''
def update_g_features(gene_i):
g_feature = g_features[gene_i]
m_feature = g_feature.sub_features[0]
c_features = [
x for x in m_feature.sub_features if x.type == 'CDS']
c_features_s = sorted(
c_features, key=lambda x: x.location.start)
cds_start = c_features_s[0].location.start
cds_end = c_features_s[-1].location.end
e_features = [
x for x in m_feature.sub_features if x.type == 'exon']
e_features_s = sorted(
e_features, key=lambda x: x.location.start)
for i in range(len(c_features)):
e_features_s[i].location = FeatureLocation(
c_features_s[i].location.start, c_features_s[i].location.end,
strand=c_features_s[i].location.strand)
m_feature.location = FeatureLocation(
cds_start, cds_end, m_feature.location.strand)
m_feature.sub_features = (
e_features_s[0:len(c_features_s)] + c_features_s)
g_features[gene_i].location = FeatureLocation(
cds_start, cds_end, strand=g_features[gene_i].location.strand)
g_features[gene_i].sub_features = [m_feature]
d_fna = SeqIO.to_dict(SeqIO.parse(genome_assembly, 'fasta'))
d_scaffold = {}
scaffold_i = 0
genome_assembly_txt = import_file(genome_assembly)
for line in genome_assembly_txt:
if not line.startswith('>'):
continue
scaffold_name = line.split(' ')[0].replace('>', '')
d_scaffold[scaffold_name] = scaffold_i
scaffold_i += 1
gff_iter = GFF.parse(input_gff3, d_fna)
gff_iter_s = sorted(list(gff_iter), key=lambda x: d_scaffold[x.id])
my_records = []
for gff_element in gff_iter_s:
g_features = gff_element.features # Genes in a scaffold
gene_i = 0
while gene_i < len(g_features) - 1:
g_feature = g_features[gene_i]
g_feature_next = g_features[gene_i + 1]
end = g_feature.location.end
start_next = g_feature_next.location.start + 1
if end >= start_next:
update_g_features(gene_i)
update_g_features(gene_i + 1)
gene_i += 1
gff_element.features = g_features
my_records.append(gff_element)
GFF.write(my_records, open(output_gff3, 'w'))
if __name__ == '__main__':
main()