Merge pull request #279 from TranslatorSRI/babel-1.6

This PR collects some changes that were necessary for the Translator "Fugu" release.

* Upgraded:
  * Biolink Model: 4.1.6 -> 4.2.1
  * UMLS version: 2023AB -> 2024AA
  * RxNorm version: 03042024 -> 07012024
  * PANTHER sequence: PANTHER_Sequence_Classification_files/PTHR18.0_human -> PANTHER_Sequence_Classification_files/PTHR19.0_human
  * PANTHER pathways: SequenceAssociationPathway3.6.7 -> SequenceAssociationPathway3.6.8
* Added a manual Disease concord connecting MONDO:0005799 "hookworm infectious disease" and MESH:D006725 "Hookworm Infections" (as per NCATSTranslator/Tests#92).
* Reduced the number of warnings generated by the Publications compendium generator, so that we only warn of a duplicate title for a publication if the subsequent title actually differs from the original one.
* Added a check for the disease manual concord: an error is raised if a line in the file does not have three tab-separated columns (such as if the tabs were silently converted into spaces).
* Fixed a bug in the FRINK call (the SPARQL query is now sent in the POST body instead of as URL parameters) and added improved error checking for failed requests and unparseable responses.
TranslatorSRI · Sep 23, 2024 · 02505bb · 02505bb
2 parents 716a784 + 3e2da10
commit 02505bb
Show file tree

Hide file tree

Showing 8 changed files with 28 additions and 12 deletions.
diff --git a/config.json b/config.json
@@ -4,9 +4,9 @@
   "intermediate_directory": "babel_outputs/intermediate",
   "output_directory": "babel_outputs",
 
-  "biolink_version": "4.1.6",
-  "umls_version": "2023AB",
-  "rxnorm_version": "03042024",
+  "biolink_version": "4.2.1",
+  "umls_version": "2024AA",
+  "rxnorm_version": "07012024",
 
   "ncbi_files": ["gene2ensembl.gz", "gene_info.gz", "gene_orthologs.gz", "gene_refseq_uniprotkb_collab.gz", "mim2gene_medgen"],
   "ubergraph_ontologies": ["UBERON", "CL", "GO", "NCIT", "ECO", "ECTO", "ENVO", "HP", "UPHENO","BFO","BSPO","CARO","CHEBI","CP","GOREL","IAO","MAXO","MONDO","PATO","PR","RO","UBPROP"],

diff --git a/input_data/manual_concords/disease.txt b/input_data/manual_concords/disease.txt
@@ -108,3 +108,6 @@ MONDO:0007079	oio:closeMatch	UMLS:C4268210
 MONDO:0007079	oio:closeMatch	UMLS:C3650363
 MONDO:0007079	oio:closeMatch	UMLS:C4536264
 
+# Not sure why MONDO:0005799 isn't connecting with MESH:D006725, but we can connect it manually here.
+# See https://github.com/NCATSTranslator/Tests/issues/92
+MONDO:0005799	xref	MESH:D006725
diff --git a/src/createcompendia/anatomy.py b/src/createcompendia/anatomy.py
@@ -107,8 +107,13 @@ def build_wikidata_cell_relationships(outdir):
           ?wd wdt:P2892 ?umls .
         }"""
     frink_wikidata_url = "https://frink.apps.renci.org/federation/sparql"
-    response = requests.post(frink_wikidata_url, params={'query': sparql})
-    results = response.json()
+    response = requests.post(frink_wikidata_url, data={'query': sparql})
+    if not response.ok:
+        raise RuntimeError(f"Could not query {frink_wikidata_url}: {response.status_code} {response.reason}")
+    try:
+        results = response.json()
+    except Exception as e:
+        raise RuntimeError(f"Could not parse {frink_wikidata_url}: {e} raised when parsing response {response.content}.")
     rows = results["results"]["bindings"]
     # If one wikidata entry has either more than one CL or more than one UMLS, then we end up with problems
     # (It could also be possible that the same CL is on more than one wikidata entry, but haven't seen that yet)

diff --git a/src/createcompendia/publications.py b/src/createcompendia/publications.py
@@ -199,8 +199,10 @@ def generate_compendium(concordances, identifiers, titles, publication_compendiu
             for line in titlef:
                 id, title = line.strip().split('\t')
                 if id in labels:
-                    logging.warning(
-                        f"Duplicate title for {id}: ignoring previous title '{labels[id]}', using new title '{title}'.")
+                    # Don't emit a warning unless the title has actually changed.
+                    if labels[id] != title:
+                        logging.warning(
+                            f"Duplicate title for {id}: ignoring previous title '{labels[id]}', using new title '{title}'.")
                 labels[id] = title
 
     # Write out the compendium.

diff --git a/src/datahandlers/pantherfamily.py b/src/datahandlers/pantherfamily.py
@@ -3,7 +3,9 @@
 
 def pull_pantherfamily():
     outfile=f'{PANTHERFAMILY}/family.csv'
-    pull_via_ftp('ftp.pantherdb.org','/sequence_classifications/current_release/PANTHER_Sequence_Classification_files/','PTHR18.0_human',outfilename=outfile)
+    pull_via_ftp('ftp.pantherdb.org','/sequence_classifications/current_release/PANTHER_Sequence_Classification_files/','PTHR19.0_human',outfilename=outfile)
+    # If you need to check this quickly, it's also available on HTTP at:
+    # - http://data.pantherdb.org/ftp/sequence_classifications/current_release/PANTHER_Sequence_Classification_files/
 
 def pull_labels(infile,outfile):
     with open(infile,'r') as inf:

diff --git a/src/datahandlers/pantherpathways.py b/src/datahandlers/pantherpathways.py
@@ -2,7 +2,7 @@
 from src.babel_utils import pull_via_urllib
 
 def pull_panther_pathways():
-    pull_via_urllib('http://data.pantherdb.org/ftp/pathway/current_release/', 'SequenceAssociationPathway3.6.7.txt', decompress=False, subpath='PANTHER.PATHWAY')
+    pull_via_urllib('http://data.pantherdb.org/ftp/pathway/current_release/', 'SequenceAssociationPathway3.6.8.txt', decompress=False, subpath='PANTHER.PATHWAY')
 
 def make_pathway_labels(infile,outfile):
     with open(infile,'r') as inf:

diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile
@@ -375,13 +375,13 @@ rule get_SMPDB_labels:
 
 rule get_panther_pathways:
     output:
-        outfile = config['download_directory'] + '/PANTHER.PATHWAY/SequenceAssociationPathway3.6.7.txt'
+        outfile = config['download_directory'] + '/PANTHER.PATHWAY/SequenceAssociationPathway3.6.8.txt'
     run:
         pantherpathways.pull_panther_pathways()
 
 rule get_panther_pathway_labels:
     input:
-        infile=config['download_directory'] + '/PANTHER.PATHWAY/SequenceAssociationPathway3.6.7.txt'
+        infile=config['download_directory'] + '/PANTHER.PATHWAY/SequenceAssociationPathway3.6.8.txt'
     output:
         labelfile=config['download_directory'] + '/PANTHER.PATHWAY/labels'
     run:

diff --git a/src/snakefiles/diseasephenotype.snakefile b/src/snakefiles/diseasephenotype.snakefile
@@ -127,7 +127,11 @@ rule disease_manual_concord:
                 lstripped_line = line.lstrip()
                 if lstripped_line == '' or lstripped_line.startswith('#'):
                     continue
-                outp.writelines([lstripped_line])
+                # Make sure the line has three tab-delimited values, and fail otherwise.
+                elements = lstripped_line.split('\t')
+                if len(elements) != 3:
+                    raise RuntimeError(f"Found {len(elements)} elements on line {lstripped_line}, expected 3: {elements}")
+                outp.writelines(["\t".join(elements)])
 
 rule disease_compendia:
     input: