# covid.py
import pandas as pd
import numpy
import urllib.request
import os
import json
from oger.ctrl.router import Router, PipelineServer
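
# Ontology vocabularies for OGER entity recognition; VOCABULARIES is not used
# in this module and is presumably imported by other scripts in the project.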
VOCABULARY = "CHEBI CL GO_BP GO_CC GO_MF MOP NCBITaxon PR SO UBERON"
VOCABULARIES = VOCABULARY.split()
PMID_URL = 'https://www.ncbi.nlm.nih.gov/research/coronavirus-api/export?'


def get_pmids(outpath='data/ids/'):
    """Download the NCBI coronavirus (LitCovid) PMID export and write a
    plain-text list of PMIDs, excluding those listed in bad_pmids.txt."""
    with open('bad_pmids.txt') as f:
        bad_pmids = [pmid.strip() for pmid in f.read().split(', ')]
    tsv_output = os.path.join(outpath, 'all_pmids.tsv')
    txt_output = os.path.join(outpath, 'all_pmids.txt')
    urllib.request.urlretrieve(PMID_URL, tsv_output)
    dataf = pd.read_csv(tsv_output, sep='\t', comment='#')
    # Compare as strings so the filter works regardless of the column's dtype.
    dataf = dataf['pmid'][~dataf['pmid'].astype(str).isin(bad_pmids)]
    dataf.to_csv(txt_output, sep=' ', index=False, header=False)


def pmcods_to_txt(inpath='data/ids/PMID-PMCID_15062020.ods',
                  old='data/ids/old_pmcids.txt'):
    """Extract PMCIDs from an ODS spreadsheet, writing the full list and the
    IDs that do not appear in the old list."""
    newf = pd.read_excel(inpath, engine="odf")
    newf = newf[['PMCID']]
    newf['PMCID'] = newf['PMCID'].replace("", numpy.nan)
    newf = newf.dropna(subset=['PMCID'])
    # Drop the 'PMC' prefix, keeping only the numeric part of the ID.
    newf['PMCID'] = newf['PMCID'].str.slice(3)
    outpath = os.path.join(os.path.dirname(inpath), 'pmcids.txt')
    newf['PMCID'].to_csv(outpath, index=False, header=False)
    oldf = pd.read_csv(old, header=None, names=["PMCID"])
    news = set(newf['PMCID'].astype(int))
    olds = set(oldf['PMCID'])
    diffs = news.difference(olds)
    # TODO: remove bad_pmcids
    outpath = os.path.join(os.path.dirname(inpath), 'new_pmcids.txt')
    with open(outpath, "w") as g:
        g.write("\n".join(str(item) for item in diffs))


def pmctsv_to_txt(inpath):
    """Extract PMCIDs from a tab-separated file with a PMCID column and write
    them one per line."""
    dataf = pd.read_csv(inpath, header=0, delimiter='\t')
    # Drop the 'PMC' prefix, keeping only the numeric part of the ID.
    dataf['PMCID'] = dataf['PMCID'].str.slice(3)
    dataf['PMCID'] = dataf['PMCID'].replace("", numpy.nan)
    dataf = dataf.dropna(subset=['PMCID'])
    # This article is somehow not available on PMC.
    dataf = dataf[dataf['PMCID'] != '7068758']
    outpath = os.path.join(os.path.dirname(inpath), 'pmcids.txt')
    dataf['PMCID'].to_csv(outpath, index=False, header=False)


def conll_collection_to_jsons(inpath='data/merged/collection.conll',
                              outpath='data/pubannotation',
                              sourcedb='pubmed'):
    """Split a CoNLL collection into one PubAnnotation JSON file per document."""
    pl = PipelineServer(Router())
    collection = pl.load_one(inpath, 'conll')
    os.makedirs(outpath, exist_ok=True)
    for document in collection:
        pmid = document.id_
        outfile = os.path.join(outpath, pmid + '.json')
        with open(outfile, 'w', encoding='utf8') as g:
            pl.write(document, 'pubanno_json', g)
        # The written JSON lacks the right sourcedb, so patch it in afterwards.
        with open(outfile, 'r+', encoding='utf8') as g:
            anno = json.load(g)
            anno['sourcedb'] = sourcedb
            g.seek(0)
            g.truncate()
            json.dump(anno, g)


def get_naked_conll(inpath='oger/collection.conll',
                    outpath='data/collection.naked.conll'):
    """Strip all entity annotations from a CoNLL collection and rewrite it
    with document IDs and offsets included."""
    pl = PipelineServer()
    coll = pl.load_one(inpath, 'conll')
    for s in coll.get_subelements('sentence'):
        s.entities.clear()
    with open(outpath, 'w') as f:
        pl.write(coll, 'conll', f, conll_include='docid offsets')


def conll_collection_to_txts(inpath='data/merged/collection.conll',
                             outpath='data/public/txt'):
    """Write each document in a CoNLL collection as a plain-text file."""
    pl = PipelineServer(Router())
    collection = pl.load_one(inpath, 'conll')
    os.makedirs(outpath, exist_ok=True)
    for document in collection:
        pmid = document.id_
        outfile = os.path.join(outpath, pmid + '.txt')
        with open(outfile, 'w', encoding='utf8') as g:
            pl.write(document, 'txt', g)


def bioc_to_brat(inpath='data/merged/collection.bioc.json',
                 outpath='data/merged/brat'):
    """Convert a BioC JSON collection to brat standoff format."""
    pl = PipelineServer()
    coll = pl.load_one(inpath, "bioc_json")
    for doc in coll:
        pl.export(doc, output_directory=outpath, export_format='brat')
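

# Minimal usage sketch (an assumption, not part of the original script):
# get_pmids() expects a bad_pmids.txt file and the data/ids/ directory to
# exist; conll_collection_to_jsons() expects data/merged/collection.conll,
# matching the default arguments above.
if __name__ == '__main__':
    get_pmids()                   # download and filter the current PMID list
    conll_collection_to_jsons()   # one PubAnnotation JSON per document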