-
Notifications
You must be signed in to change notification settings - Fork 0
/
Pubmed term search for large datasets.py
48 lines (43 loc) · 1.81 KB
/
Pubmed term search for large datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import requests
import time
import pandas as pd
from datetime import datetime
# Path of the Excel workbook to annotate. The code below reads its
# 'Gene name' and 'modsites' columns and writes a 'pubmed results' column.
file = '/.../.../... .xlsx'
df = pd.read_excel(file).drop_duplicates()
base_url = 'https://pubmed.ncbi.nlm.nih.gov/?term='

# Phrases whose presence in the returned page is treated as "no hits".
_NO_RESULT_MARKERS = (
    'The following term was not found in PubMed',
    'Your search was processed without automatic term mapping because it retrieved zero results',
)
_REQUEST_PAUSE = 0.05   # seconds slept before every request
_REQUEST_TIMEOUT = 30   # seconds before a single request is abandoned
_RETRY_DELAY = 30       # seconds slept before the one retry
# requests raises its own exception classes rather than the builtin
# TimeoutError; the builtin is kept so anything the old handler caught
# is still caught.
_NETWORK_ERRORS = (
    TimeoutError,
    requests.exceptions.Timeout,
    requests.exceptions.ConnectionError,
)


def _fetch(query):
    """Return the response for *query*, or None if two attempts both fail.

    A failed first attempt is followed by a ``_RETRY_DELAY`` pause and one
    retry. Only timeouts and connection errors are treated as retryable;
    any other exception propagates.
    """
    for attempt in (1, 2):
        try:
            return requests.get(query, timeout=_REQUEST_TIMEOUT)
        except _NETWORK_ERRORS:
            if attempt == 1:
                time.sleep(_RETRY_DELAY)
    return None


def _record(row, result):
    """Store *result* in ``row['pubmed results']``, '; '-joined to any prior text.

    A missing key or a non-string value (e.g. the NaN float pandas uses for
    an empty cell) is overwritten rather than concatenated.
    """
    existing = row.get('pubmed results')
    if isinstance(existing, str):
        row['pubmed results'] = existing + '; ' + result
    else:
        row['pubmed results'] = result


def main():
    """Query PubMed for every gene/modsite pair in ``df`` and save the results.

    For each row, ``modsites`` is split on ':' and one search URL is requested
    per site. The outcome recorded per site is the query URL (hits found),
    'No results' (page contains a no-hit phrase) or 'try again' (request
    failed twice). Non-200 responses are printed and not recorded. The
    annotated table is written next to the input file with a date stamp
    appended to the file name.
    """
    # Local import: os is not among this file's top-level imports.
    import os

    rows = df.to_dict('index')
    # For completing partial files
    # rows = {i: rows[i] for i in rows if type(rows[i]['pubmed results']) == float}
    length = len(df)
    for i, row in rows.items():
        print(f'{i}/{length}')
        gene, modsites = row['Gene name'], row['modsites']
        # Empty cells are loaded as NaN floats, which cannot be split or
        # concatenated into a URL, so such rows are skipped.
        if not (isinstance(gene, str) and isinstance(modsites, str)):
            print('Skipping row with missing gene name or modsites', i)
            continue
        for modsite in modsites.split(':'):
            query = base_url + gene + '+' + modsite
            time.sleep(_REQUEST_PAUSE)
            response = _fetch(query)
            if response is None:
                # Both attempts failed: leave a marker so the pair can be
                # re-run later instead of touching an undefined response.
                print('Request failed', query)
                _record(row, 'try again')
                continue
            if response.status_code != 200:
                print(response.status_code, '\n')
                continue
            page = response.text
            if any(marker in page for marker in _NO_RESULT_MARKERS):
                print('False', query)
                result = 'No results'
            else:
                print('True', query)
                result = query
            _record(row, result)

    # Month-day-year stamp; a datetime object cannot be indexed, so the
    # fields are formatted explicitly. NOTE(review): MMDDYYYY is inferred
    # from the original index order [1], [2], [0] - confirm.
    stamp = datetime.now().strftime('%m%d%Y')
    # Strip only the final extension so dots elsewhere in the path survive.
    stem = os.path.splitext(file)[0]
    pd.DataFrame.from_dict(rows, orient='index').to_excel(f'{stem}{stamp}.xlsx')


if __name__ == "__main__":
    main()