Skip to content

Commit

Permalink
Merge pull request #279 from TranslatorSRI/babel-1.6
Browse files Browse the repository at this point in the history
This PR collects some changes that were necessary for the Translator "Fugu" release.
* Upgraded:
  * Biolink Model: 4.1.6 -> 4.2.1
  * UMLS version: 2023AB -> 2024AA
  * RxNorm version: 03042024 -> 07012024
  * PANTHER sequence: PANTHER_Sequence_Classification_files/PTHR18.0_human -> PANTHER_Sequence_Classification_files/PTHR19.0_human
  * PANTHER pathways: SequenceAssociationPathway3.6.7 -> SequenceAssociationPathway3.6.8
* Added a manual Disease concord connecting MONDO:0005799 "hookworm infectious disease" and MESH:D006725 "Hookworm Infections" (as per NCATSTranslator/Tests#92)
* Reduced the number of warnings generated by the Publications compendium generator, so that we only warn of a duplicate title for a publication if the subsequent title actually differs from the original one.
* Added a check for the disease manual concord: an error is raised if any non-blank, non-comment line in the file does not have exactly three tab-separated columns (such as if the tabs were silently converted into spaces).
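* Fixed a bug in the FRINK SPARQL call (the query is now sent in the POST body rather than in the URL query string) and added improved error checking for non-OK HTTP responses and unparseable JSON.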
  • Loading branch information
gaurav authored Sep 23, 2024
2 parents 716a784 + 3e2da10 commit 02505bb
Show file tree
Hide file tree
Showing 8 changed files with 28 additions and 12 deletions.
6 changes: 3 additions & 3 deletions config.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
"intermediate_directory": "babel_outputs/intermediate",
"output_directory": "babel_outputs",

"biolink_version": "4.1.6",
"umls_version": "2023AB",
"rxnorm_version": "03042024",
"biolink_version": "4.2.1",
"umls_version": "2024AA",
"rxnorm_version": "07012024",

"ncbi_files": ["gene2ensembl.gz", "gene_info.gz", "gene_orthologs.gz", "gene_refseq_uniprotkb_collab.gz", "mim2gene_medgen"],
"ubergraph_ontologies": ["UBERON", "CL", "GO", "NCIT", "ECO", "ECTO", "ENVO", "HP", "UPHENO","BFO","BSPO","CARO","CHEBI","CP","GOREL","IAO","MAXO","MONDO","PATO","PR","RO","UBPROP"],
Expand Down
3 changes: 3 additions & 0 deletions input_data/manual_concords/disease.txt
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,6 @@ MONDO:0007079 oio:closeMatch UMLS:C4268210
MONDO:0007079 oio:closeMatch UMLS:C3650363
MONDO:0007079 oio:closeMatch UMLS:C4536264

# Not sure why MONDO:0005799 isn't connecting with MESH:D006725, but we can connect it manually here.
# See https://github.com/NCATSTranslator/Tests/issues/92
MONDO:0005799 xref MESH:D006725
9 changes: 7 additions & 2 deletions src/createcompendia/anatomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,13 @@ def build_wikidata_cell_relationships(outdir):
?wd wdt:P2892 ?umls .
}"""
frink_wikidata_url = "https://frink.apps.renci.org/federation/sparql"
response = requests.post(frink_wikidata_url, params={'query': sparql})
results = response.json()
response = requests.post(frink_wikidata_url, data={'query': sparql})
if not response.ok:
raise RuntimeError(f"Could not query {frink_wikidata_url}: {response.status_code} {response.reason}")
try:
results = response.json()
except Exception as e:
raise RuntimeError(f"Could not parse {frink_wikidata_url}: {e} raised when parsing response {response.content}.")
rows = results["results"]["bindings"]
# If one wikidata entry has either more than one CL or more than one UMLS, then we end up with problems
# (It could also be possible that the same CL is on more than one wikidata entry, but haven't seen that yet)
Expand Down
6 changes: 4 additions & 2 deletions src/createcompendia/publications.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,10 @@ def generate_compendium(concordances, identifiers, titles, publication_compendiu
for line in titlef:
id, title = line.strip().split('\t')
if id in labels:
logging.warning(
f"Duplicate title for {id}: ignoring previous title '{labels[id]}', using new title '{title}'.")
# Don't emit a warning unless the title has actually changed.
if labels[id] != title:
logging.warning(
f"Duplicate title for {id}: ignoring previous title '{labels[id]}', using new title '{title}'.")
labels[id] = title

# Write out the compendium.
Expand Down
4 changes: 3 additions & 1 deletion src/datahandlers/pantherfamily.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@

def pull_pantherfamily():
outfile=f'{PANTHERFAMILY}/family.csv'
pull_via_ftp('ftp.pantherdb.org','/sequence_classifications/current_release/PANTHER_Sequence_Classification_files/','PTHR18.0_human',outfilename=outfile)
pull_via_ftp('ftp.pantherdb.org','/sequence_classifications/current_release/PANTHER_Sequence_Classification_files/','PTHR19.0_human',outfilename=outfile)
# If you need to check this quickly, it's also available on HTTP at:
# - http://data.pantherdb.org/ftp/sequence_classifications/current_release/PANTHER_Sequence_Classification_files/

def pull_labels(infile,outfile):
with open(infile,'r') as inf:
Expand Down
2 changes: 1 addition & 1 deletion src/datahandlers/pantherpathways.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from src.babel_utils import pull_via_urllib

def pull_panther_pathways():
pull_via_urllib('http://data.pantherdb.org/ftp/pathway/current_release/', 'SequenceAssociationPathway3.6.7.txt', decompress=False, subpath='PANTHER.PATHWAY')
pull_via_urllib('http://data.pantherdb.org/ftp/pathway/current_release/', 'SequenceAssociationPathway3.6.8.txt', decompress=False, subpath='PANTHER.PATHWAY')

def make_pathway_labels(infile,outfile):
with open(infile,'r') as inf:
Expand Down
4 changes: 2 additions & 2 deletions src/snakefiles/datacollect.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -375,13 +375,13 @@ rule get_SMPDB_labels:

rule get_panther_pathways:
output:
outfile = config['download_directory'] + '/PANTHER.PATHWAY/SequenceAssociationPathway3.6.7.txt'
outfile = config['download_directory'] + '/PANTHER.PATHWAY/SequenceAssociationPathway3.6.8.txt'
run:
pantherpathways.pull_panther_pathways()

rule get_panther_pathway_labels:
input:
infile=config['download_directory'] + '/PANTHER.PATHWAY/SequenceAssociationPathway3.6.7.txt'
infile=config['download_directory'] + '/PANTHER.PATHWAY/SequenceAssociationPathway3.6.8.txt'
output:
labelfile=config['download_directory'] + '/PANTHER.PATHWAY/labels'
run:
Expand Down
6 changes: 5 additions & 1 deletion src/snakefiles/diseasephenotype.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,11 @@ rule disease_manual_concord:
lstripped_line = line.lstrip()
if lstripped_line == '' or lstripped_line.startswith('#'):
continue
outp.writelines([lstripped_line])
# Make sure the line has three tab-delimited values, and fail otherwise.
elements = lstripped_line.split('\t')
if len(elements) != 3:
raise RuntimeError(f"Found {len(elements)} elements on line {lstripped_line}, expected 3: {elements}")
outp.writelines(["\t".join(elements)])

rule disease_compendia:
input:
Expand Down

0 comments on commit 02505bb

Please sign in to comment.