forked from stuppie/semmed-biolink
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsemmed_biolink_environment.py
61 lines (44 loc) · 2.13 KB
/
semmed_biolink_environment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), verbose=True)
# Pipeline configuration: every constant below is derived from environment
# variables (optionally supplied through the project's .env file, which is
# loaded above) and is meant to be imported by the data processing scripts.
#
# All of these variables are spliced into paths/URLs by string concatenation,
# so each one is mandatory. os.getenv() returns None for an unset variable,
# which would otherwise surface further down as an opaque
# "unsupported operand type(s) for +: 'NoneType' and 'str'" TypeError that
# does not name the offending variable. Check them all up front instead and
# report every missing name in one message.
_REQUIRED_ENV_VARS = (
    "DATA",
    "UMLS_VERSION",
    "METAMAP_VERSION",
    "SEMEDDB_VERSION",
    "SEMEDDB_PUBMED_RELEASE",
    "UNII_VERSION",
)
_missing_env_vars = [
    name for name in _REQUIRED_ENV_VARS if os.getenv(name) is None
]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): {}. "
        "Set them in the environment or in the project's .env file.".format(
            ", ".join(_missing_env_vars)
        )
    )

# DATA points to the location of all (meta-)data files used in the data
# processing pipeline. File names are appended to it directly, so guarantee a
# trailing path separator: os.path.join(x, "") adds one only when it is
# missing and leaves an empty value untouched.
DATA = os.path.join(os.getenv("DATA"), "")

# See project README for more details about UMLS metadata file download and
# preparation.
UMLS_VERSION = os.getenv("UMLS_VERSION")

# Just taking the 'active' vocabularies.
# TODO: should the UMLS_PATH be further parameterized?
UMLS_PATH = DATA + UMLS_VERSION + "_Active/"

MRSAT_ARCHIVE = UMLS_PATH + "MRSAT.RRF.gz"
MRCONSO_ENG_ARCHIVE = UMLS_PATH + "MRCONSO_ENG.RRF.gz"
MRSTY_ARCHIVE = UMLS_PATH + "MRSTY.RRF.gz"

# The MetaMap semantic encodings (remote URLs, versioned by METAMAP_VERSION).
METAMAP_VERSION = os.getenv("METAMAP_VERSION")

SEMGROUPS = "https://metamap.nlm.nih.gov/Docs/SemGroups_" + METAMAP_VERSION + ".txt"
SEMTYPES = "https://metamap.nlm.nih.gov/Docs/SemanticTypes_" + METAMAP_VERSION + "AA.txt"

# Original source data of the Semantic Medline Database.
# Generated by the 'semmed_sql_to_csv.sh' script from SemMedDb SQL dumps.
# NOTE: the constant names below spell the database name inconsistently
# (SEMEDDB / SEMMEDB / SEMMEDDB); they are kept exactly as-is because other
# modules import them by these names.
SEMEDDB_VERSION = os.getenv("SEMEDDB_VERSION")
SEMEDDB_PUBMED_RELEASE = os.getenv("SEMEDDB_PUBMED_RELEASE")

SEMMEDB_FILE_PREFIX = "semmedVER" + SEMEDDB_VERSION

# File naming format as of Release 4.0(?)
SEMMEDDB_PREDICATION_FILE = (
    SEMMEDB_FILE_PREFIX + "_" + SEMEDDB_PUBMED_RELEASE + "_R_PREDICATION"
)
SEMMEDDB_PREDICATION_CSV = DATA + SEMMEDDB_PREDICATION_FILE + ".csv"

# Pipeline of data files; these don't really need to change between updates.
EDGES1_TSV = DATA + "edges1.tsv"
EDGES2_TSV = DATA + "edges2.tsv"
EDGES3_TSV = DATA + "edges3.tsv"
EDGES4_TSV = DATA + "edges4.tsv"

NODES1_TSV = DATA + "nodes1.tsv"
NODES_BLM_TSV = DATA + "nodes_blm.tsv"

EDGES_FILTERED_TSV = DATA + "edges_filtered.tsv"
NODES_FILTERED_TSV = DATA + "nodes_filtered.tsv"

EDGES_BIOLINK_TSV = DATA + "edges_biolink.tsv"
NODES_BIOLINK_TSV = DATA + "nodes_biolink.tsv"

# UNII records file, versioned by UNII_VERSION.
UNII_VERSION = os.getenv("UNII_VERSION")
UNII_RECORDS = DATA + "UNII_Records_" + UNII_VERSION + ".txt"

# Supplementary lookup / cross-reference files kept under DATA.
XREFS_SHELVE = DATA + "xrefs.shelve"
UBERON_CSV = DATA + "uberon.csv"
DOID_CSV = DATA + "doid.csv"
MESH_XREFS_TSV = DATA + "mesh_xrefs.tsv"

NODES_XREF_TSV = DATA + "nodes_xref.tsv"

# KGX-named node and edge files.
NODES_KGX_TSV = DATA + "nodes_kgx.tsv"
EDGES_KGX_TSV = DATA + "edges_kgx.tsv"