-
Notifications
You must be signed in to change notification settings - Fork 1
/
miRNA_Target_Prediction.py
154 lines (137 loc) · 5.5 KB
/
miRNA_Target_Prediction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import re
import time
import sys
import os
import pandas as pd
import requests
import random
def getArgvDict(argv):
optionDict = {}
c = 0
for i in argv:
if re.match(r'-\D', i):
optionDict[i] = argv[c + 1]
c += 1
return optionDict
def ScrapyTarScan(url, regEx):
time.sleep(random.uniform(2.2,4.4))
response = requests.get(url)
targetscanFile = response.content
targetscanResPage = targetscanFile.decode('utf-8')
regObj = re.compile(regEx, re.DOTALL)
match = regObj.search(targetscanResPage)
if match:
scrapyRes = match.group(1)
return scrapyRes
else:
return 'Nothing'
def GetTargetScan(miR, out, threshold):
baseUrl1 = 'http://www.targetscan.org/cgi-bin/targetscan/vert_72/'
baseUrl2 = 'http://www.targetscan.org'
searchItem = 'targetscan.cgi?species=Human&gid=&mir_sc=&mir_c=&mir_nc=&mir_vnc=&mirg='
regEx1 = '<A HREF="(.*?)">Download'
regEx2 = '<A HREF="(.*?)" download'
regEx3 = '(.*)'
targetscanUrl = baseUrl1 + searchItem + miR
downloadItem = ScrapyTarScan(targetscanUrl, regEx1)
if 'Nothing' == downloadItem:
print('Warning: This miRNA('+miR+') is not exists in TargetScan v7.2, you may need check this out mannully.')
geneList = []
else:
downloadUrl = baseUrl1 + downloadItem
textItem = ScrapyTarScan(downloadUrl, regEx2)
textUrl = baseUrl2 + textItem
targetRes = ScrapyTarScan(textUrl, regEx3)
t = open(out + 'TargetScan_v7.2/TMP', 'w')
t.write(targetRes)
t.close()
oriTSC = pd.read_table(out + 'TargetScan_v7.2/TMP', header = 0, sep = '\t')
tmp1 = oriTSC[oriTSC['Representative miRNA'] == miR]
tmp2 = tmp1[tmp1['Cumulative weighted context++ score'] <= float(threshold)]
geneList = tmp2['Target gene'].values
tmp2.to_csv(out + 'TargetScan_v7.2/TarScan-' + miR + '.tsv', index = False, header = True, sep = '\t')
os.remove(outDir + 'TargetScan_v7.2/TMP')
return geneList
def GetMiRDB(miR, RDB, out, threshold):
oriRDB = pd.read_table(RDB, header = 0, sep = '\t')
tmp1 = oriRDB[oriRDB['miRNA'] == miR]
if len(tmp1) <= 1:
print('Warning: This miRNA('+miR+') is not exists in miRDB v6.0, you may need check this out mannully.')
geneList = []
else:
tmp2 = tmp1[tmp1['score'] >= int(threshold)]
geneList = tmp2['Gene'].values
tmp2.to_csv(out + 'miRDB_v6.0/miRDB_NM-' + miR + '.tsv', index = False, header = True, sep = '\t')
return geneList
def GetFinalTarget(miR, tsc, rdb, out):
clipdf = pd.read_table('./data/GSE111432_AGO2CLIP_gene.txt', header = None, names=['SYMBOL'], sep = '\t')
clipSymbol = clipdf['SYMBOL'].values
with open('./data/miRDB_v6.0.hsaOnly.RefSeq2SYMBOL.txt', 'r') as f:
cliptxt = f.read()
rdb_symbol = []
for i in rdb:
match = re.search(i+r'\t(\S+)\n', cliptxt)
if match:
rdb_symbol.append(match.group(1))
TSC_b_RDB = set(tsc).union(set(rdb_symbol))
TSC_b_RDB_j_CLIP = TSC_b_RDB.intersection(set(clipSymbol))
clip = [s for s in TSC_b_RDB_j_CLIP if s != '']
t = open(out+'final_targets/Targets-' + miR + '.tsv', 'w')
t.writelines("\n".join(TSC_b_RDB_j_CLIP))
t.close()
if __name__ == '__main__':
helpDoc = '''
-H/-h Show this help message and exit.
-miRNA The miRNAs list file , one miRNA per line. Please use the
complete miRNA name, e.g. hsa-miR-218-5p.
[-tarScanScore] Optional. The threshold of cumulative weighted
context++ score in TargetScan-V7.2 to filter results. The
predicted target with the score great than this threshold
will be filtered out. The value range is [-1, 0] (-1 means
best prediction). The defult value is -0.2.
[-miRDBScore] Optional. The threshold for prediction confidence
in miRDB-V6.0 to filter results. The predicted target with
the score lower than this threshold will be filtered out.
The value range is [50, 100] (100 means best prediction).
The defult value is 60.
-outputDir A folder will be used to store results.
The script will output the following folder/files into -outputDir:
TargetScan_v7.2/TarScan-miRNA.tsv, TargetScan results.
miRDB_v6.0/miRDB_NM-miRNA.tsv, miRDB results.
final_targets/Targets-miRNA.tsv, the union of TargetScan and
miRDB results, and filterd by AGO2 CLIP-seq data (osteoblast).
'''
argv = sys.argv
if re.search('-H|-h', str(argv)):
print(helpDoc)
sys.exit()
try:
envDict = getArgvDict(argv)
miRFile = envDict['-miRNA']
outDir = envDict['-outputDir']
except KeyError as e:
print('ERROR: Incomplete or invalid arg '+str(e)+' ! Please check your input, or use -H/-h flag to get help.')
sys.exit()
else:
tarScanScore = (envDict['-tarScanScore'] if envDict.get('-tarScanScore') else -0.2)
miRDBScore = (envDict['-miRDBScore'] if envDict.get('-miRDBScore') else 60)
if not os.path.exists(outDir):
os.makedirs(outDir)
outDir = (outDir if outDir[-1] == '/' else outDir + '/')
if not os.path.exists(outDir+'TargetScan_v7.2'):
os.makedirs(outDir+'TargetScan_v7.2')
if not os.path.exists(outDir+'miRDB_v6.0'):
os.makedirs(outDir+'miRDB_v6.0')
if not os.path.exists(outDir+'final_targets'):
os.makedirs(outDir+'final_targets')
miRDBFile = './data/miRDB_v6.0_hsaOnly.txt'
with open(miRFile, 'r') as f:
miRList = f.readlines()
for miR in miRList:
miR = miR.strip()
print('Predicting the targets of ' + str(miR) + '...')
tarListTSC = GetTargetScan(miR, outDir, tarScanScore)
tarListRDB = GetMiRDB(miR, miRDBFile, outDir, miRDBScore)
GetFinalTarget(miR, tarListTSC, tarListRDB, outDir)
localtime = time.asctime(time.localtime(time.time()))
print('Done. Finish time: '+ localtime + '\n')