-
Notifications
You must be signed in to change notification settings - Fork 0
/
add_sequences
34 lines (26 loc) · 1.15 KB
/
add_sequences
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import csv

import pandas as pd
from Bio import SeqIO
# Add sequences to either promoters or non-promoters.
# Both values below are placeholders; replace them with real file paths
# before running the script.
input_gtf_path = 'path_to_your_file'  # tab-separated GTF annotation passed to add_seqs
input_genome_path = 'path_to_your_fasta_file'  # genome FASTA passed to add_seqs
def add_seqs(input_genome_path, input_gtf_path):
    """Append the genomic sequence of every GTF feature as an extra column.

    The annotation is read as a 9-column, tab-separated table, the genome
    FASTA is loaded fully into memory, and for each feature the slice
    ``[Start - 1, End)`` of the matching record is extracted (``Start`` and
    ``End`` are treated as 1-based inclusive coordinates). Features on the
    ``-`` strand get the reverse complement; features whose strand is neither
    ``+`` nor ``-`` get an empty string.

    The resulting table is written next to the input as
    ``<input_gtf_path>+rev_seqs.gtf`` (tab-separated, no header, no index).

    Args:
        input_genome_path: Path to the genome FASTA file.
        input_gtf_path: Path to the GTF annotation file.

    Returns:
        The annotation as a ``pandas.DataFrame`` with an added ``Seq`` column.

    Raises:
        KeyError: If a stranded feature refers to a sequence id that is not
            present in the FASTA file.
    """
    import csv  # local import keeps the function self-contained

    column_names = ['Seqid', 'Source', 'Type', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Suppl']
    df = pd.read_csv(
        input_gtf_path,
        sep='\t',
        header=None,
        names=column_names,
        # Seqid must stay a string: purely numeric chromosome names ("1", "2")
        # would otherwise be inferred as ints and never match the FASTA ids.
        dtype={'Seqid': str, 'Start': int, 'End': int},
        comment='#',
        # GTF attributes contain literal double quotes; treat them as data.
        quoting=csv.QUOTE_NONE,
    )

    genome_sequences = SeqIO.to_dict(SeqIO.parse(input_genome_path, "fasta"))

    seqs = []
    for seqid, start, end, strand in zip(df['Seqid'], df['Start'], df['End'], df['Strand']):
        if strand not in ('+', '-'):
            # Unstranded / unknown strand: no sequence is extracted.
            seqs.append("")
            continue
        try:
            record = genome_sequences[seqid]
        except KeyError:
            raise KeyError(
                f"Sequence id {seqid!r} from {input_gtf_path} "
                f"was not found in {input_genome_path}"
            ) from None
        # Convert 1-based inclusive coordinates to a 0-based half-open slice.
        fragment = record.seq[start - 1:end]
        if strand == '-':
            fragment = fragment.reverse_complement()
        seqs.append(str(fragment))
    df['Seq'] = seqs

    output_path = f"{input_gtf_path}+rev_seqs.gtf"
    # QUOTE_NONE prevents pandas from wrapping the attributes column in quotes
    # and doubling its inner quotes, which would corrupt the GTF output.
    df.to_csv(output_path, sep='\t', header=False, index=False, quoting=csv.QUOTE_NONE)
    print(f'File with sequences was saved to {output_path}')
    return df
# Run the extraction only when executed as a script, so that importing this
# module does not trigger file I/O as a side effect.
if __name__ == "__main__":
    add_seqs(input_genome_path, input_gtf_path)