diff --git a/data/templateGroups.json b/data/templateGroups.json index adc7591a..237813ea 100644 --- a/data/templateGroups.json +++ b/data/templateGroups.json @@ -21,6 +21,7 @@ ], "predicate": ["affects"], "qualifiers": { + "qualified_predicate": "causes", "object_aspect_qualifier": "activity_or_abundance", "object_direction_qualifier": "increased" }, @@ -40,6 +41,7 @@ ], "predicate": ["affects"], "qualifiers": { + "qualified_predicate": "causes", "object_aspect_qualifier": "activity_or_abundance", "object_direction_qualifier": "decreased" }, diff --git a/src/edge_manager.ts b/src/edge_manager.ts index 543a6ea5..dabacb86 100644 --- a/src/edge_manager.ts +++ b/src/edge_manager.ts @@ -207,11 +207,8 @@ export default class QueryEdgeManager { const objectIDs = [record.object.original, record.object.curie, ...record.object.equivalentCuries]; // there must be at least a minimal intersection - const subjectMatch = - subjectIDs.some((curie) => execSubjectCuries.includes(curie)) || execSubjectCuries.length === 0; - const objectMatch = objectIDs.some((curie) => execObjectCuries.includes(curie)) || execObjectCuries.length === 0; - - //if both ends match then keep record + const subjectMatch = subjectIDs.some((curie) => execSubjectCuries.includes(curie)); + const objectMatch = objectIDs.some((curie) => execObjectCuries.includes(curie)); // Don't keep self-edges const selfEdge = [...subjectIDs].some((curie) => objectIDs.includes(curie)); diff --git a/src/graph/kg_node.ts b/src/graph/kg_node.ts index 60dac19f..ff338c5d 100644 --- a/src/graph/kg_node.ts +++ b/src/graph/kg_node.ts @@ -8,12 +8,14 @@ export interface KGNodeInfo { curies: string[]; primaryCurie: string; qNodeID: string; + originalCurie?: string; } export default class KGNode { id: string; primaryCurie: string; qNodeID: string; + originalCurie: string; curies: string[]; names: string[]; semanticType: string[]; @@ -36,6 +38,9 @@ export default class KGNode { this.targetNodes = new Set(); this.sourceQNodeIDs = new Set(); this.targetQNodeIDs = new Set(); + + // store original curie to output `query_id bte#815` + this.originalCurie = info.originalCurie; } addSourceNode(kgNodeID: string): void { diff --git a/src/graph/knowledge_graph.ts b/src/graph/knowledge_graph.ts index 58b621cb..1a28e9ca 100644 --- a/src/graph/knowledge_graph.ts +++ b/src/graph/knowledge_graph.ts @@ -14,10 +14,42 @@ import KGNode from './kg_node'; import KGEdge from './kg_edge'; import { BTEGraphUpdate } from './graph'; import { APIDefinition } from '@biothings-explorer/types'; +import { Telemetry } from '@biothings-explorer/utils'; const debug = Debug('bte:biothings-explorer-trapi:KnowledgeGraph'); -const NON_ARRAY_ATTRIBUTES = ['biolink:knowledge_level', 'biolink:agent_type']; +const NON_ARRAY_ATTRIBUTES = ['biolink:knowledge_level', 'biolink:agent_type', 'biolink:evidence_count']; + +interface SpecialAttributeHandlers { + [attribute_type_id: string]: (value: Set, kgEdge: KGEdge) => TrapiAttribute['value']; +} + +const SPECIAL_ATTRIBUTE_HANDLERS: SpecialAttributeHandlers = { + 'biolink:max_research_phase': (value, kgEdge) => { + // Special handling for max research phase + const phase_map = { + '-1.0': 'not_provided', + '0.5': 'pre_clinical_research_phase', + '1.0': 'clinical_trial_phase_1', + '2.0': 'clinical_trial_phase_2', + '3.0': 'clinical_trial_phase_3', + '4.0': 'clinical_trial_phase_4', + }; + function map_phase(val: string) { + let new_val = phase_map[val]; + if (typeof new_val !== 'undefined') return new_val; + + const source = Object.values(kgEdge.sources).find((src) => typeof src.primary_knowledge_source !== 'undefined') + .primary_knowledge_source.resource_id; + const err = new Error( + `Unrecognized research phase (${val}) from ${source} ${kgEdge.subject} > ${kgEdge.predicate} > ${kgEdge.object}`, + ); + Telemetry.captureException(err); + return 'not_provided'; + } + return Array.from(value as Set).map(map_phase); + }, +}; export default class KnowledgeGraph { nodes: { @@ -117,13 +149,19 @@ export default class KnowledgeGraph { Object.entries(kgEdge.attributes).forEach(([key, value]) => { if (key === 'edge-attributes') return; - // if (key == 'edge-attributes') return; + + let formatted_value: TrapiAttribute['value'] = NON_ARRAY_ATTRIBUTES.includes(key) + ? Array.from(value as Set).reduce((acc, val) => acc + val) + : Array.from(value as Set); + + if (key in SPECIAL_ATTRIBUTE_HANDLERS) { + formatted_value = SPECIAL_ATTRIBUTE_HANDLERS[key](value as Set, kgEdge); + } + attributes.push({ attribute_type_id: key, - value: // technically works for numbers as well - NON_ARRAY_ATTRIBUTES.includes(key) - ? [...(value as Set)].reduce((acc, val) => acc + val) - : Array.from(value as Set), + // technically works for numbers as well + value: formatted_value, //value_type_id: 'bts:' + key, }); }); diff --git a/src/index.ts b/src/index.ts index 22164db9..09df0518 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,4 +1,4 @@ -import MetaKG from '@biothings-explorer/smartapi-kg'; +import MetaKG, { SmartAPIQueryResult } from '@biothings-explorer/smartapi-kg'; import path from 'path'; import QueryGraph from './query_graph'; import KnowledgeGraph from './graph/knowledge_graph'; @@ -30,6 +30,7 @@ import { QueryHandlerOptions } from '@biothings-explorer/types'; import BTEGraph from './graph/graph'; import QEdge from './query_edge'; import { Telemetry } from '@biothings-explorer/utils'; +import { enrichTrapiResultsWithPfocrFigures } from './results_assembly/pfocr'; // Exports for external availability export * from './types'; @@ -73,14 +74,21 @@ export default class TRAPIQueryHandler { async findUnregisteredAPIs() { const configListAPIs = this.options.apiList['include']; - const smartapiRegistry = await fs.readFile(this.path, { encoding: 'utf8' }); + + let smartapiRegistry: SmartAPIQueryResult; + if (this.options.smartapi) { + smartapiRegistry = this.options.smartapi; + } else { + const file = await fs.readFile(this.path, "utf-8"); + smartapiRegistry = JSON.parse(file); + } const smartapiIds: string[] = []; const inforesIds: string[] = []; const unregisteredAPIs: string[] = []; // TODO typing for smartapiRegistration - JSON.parse(smartapiRegistry).hits.forEach((smartapiRegistration) => { + smartapiRegistry.hits.forEach((smartapiRegistration) => { smartapiIds.push(smartapiRegistration._id); inforesIds.push(smartapiRegistration.info?.['x-translator']?.infores); }); @@ -96,14 +104,23 @@ export default class TRAPIQueryHandler { return unregisteredAPIs; } - _loadMetaKG(): MetaKG { - const metaKG = new MetaKG(this.path, this.predicatePath); + async _loadMetaKG(): Promise { debug( `Query options are: ${JSON.stringify({ ...this.options, schema: this.options.schema ? this.options.schema.info.version : 'not included', + metakg: "", + smartapi: "" })}`, ); + + if (this.options.metakg) { + const metaKG = new MetaKG(undefined, undefined, (this.options as any).metakg); + metaKG.filterKG(this.options); + return metaKG; + } + + const metaKG = new MetaKG(this.path, this.predicatePath); debug(`SmartAPI Specs read from path: ${this.path}`); metaKG.constructMetaKGSync(this.includeReasoner, this.options); return metaKG; @@ -302,6 +319,18 @@ export default class TRAPIQueryHandler { this.finalizedResults = fixedResults; } + appendOriginalCuriesToResults(results: TrapiResult[]): void { + results.forEach(result => { + Object.entries(result.node_bindings).forEach(([_, bindings]) => { + bindings.forEach(binding => { + if (this.bteGraph.nodes[binding.id].originalCurie && this.bteGraph.nodes[binding.id].originalCurie !== binding.id) { + binding.query_id = this.bteGraph.nodes[binding.id].originalCurie; + } + }) + }) + }) + } + async addQueryNodes(): Promise { const qNodeIDsByOriginalID: Map = new Map(); const curiesToResolve = [ @@ -323,6 +352,7 @@ export default class TRAPIQueryHandler { this.bteGraph.nodes[resolvedEntity.primaryID] = new KGNode(resolvedEntity.primaryID, { primaryCurie: resolvedEntity.primaryID, qNodeID: qNodeIDsByOriginalID[originalCurie], + originalCurie: originalCurie, curies: resolvedEntity.equivalentIDs, names: resolvedEntity.labelAliases, semanticType: category ? [category] : ['biolink:NamedThing'], @@ -629,7 +659,7 @@ export default class TRAPIQueryHandler { const span1 = Telemetry.startSpan({ description: 'loadMetaKG' }); debug('Start to load metakg.'); - const metaKG = this._loadMetaKG(); + const metaKG = await this._loadMetaKG(); if (!metaKG.ops.length) { let error: string; if (this.options.smartAPIID) { @@ -710,8 +740,13 @@ export default class TRAPIQueryHandler { this.createSubclassSupportGraphs(); // prune bteGraph this.bteGraph.prune(this.finalizedResults, this.auxGraphs); + // add original curies to results + this.appendOriginalCuriesToResults(this.finalizedResults); this.bteGraph.notify(); + // Attempt to enrich results with PFOCR figures + this.logs = [...this.logs, ...(await enrichTrapiResultsWithPfocrFigures(this.getResponse()))]; + span3?.finish(); // check primary knowledge sources diff --git a/src/inferred_mode/inferred_mode.ts b/src/inferred_mode/inferred_mode.ts index a11edf54..3185405e 100644 --- a/src/inferred_mode/inferred_mode.ts +++ b/src/inferred_mode/inferred_mode.ts @@ -13,6 +13,7 @@ import { TrapiKnowledgeGraph, TrapiQEdge, TrapiQNode, + TrapiQualifier, TrapiQueryGraph, TrapiResponse, TrapiResult, @@ -173,10 +174,8 @@ export default class InferredQueryHandler { const qualifierConstraints = (qEdge.qualifier_constraints || []).map((qualifierSetObj) => { return Object.fromEntries( qualifierSetObj.qualifier_set.map(({ qualifier_type_id, qualifier_value }) => [ - qualifier_type_id.replace('biolink:', ''), - Array.isArray(qualifier_value) - ? qualifier_value.map((string) => string.replace('biolink:', '')) - : qualifier_value.replace('biolink:', ''), + qualifier_type_id, + qualifier_value, ]), ) as CompactQualifiers; }); @@ -219,7 +218,7 @@ export default class InferredQueryHandler { async createQueries(qEdge: TrapiQEdge, qSubject: TrapiQNode, qObject: TrapiQNode): Promise { const templates = await this.findTemplates(qEdge, qSubject, qObject); // combine creative query with templates - const subQueries = templates.map(({ template, queryGraph }) => { + const subQueries = templates.map(({ template, queryGraph, qualifiers }) => { queryGraph.nodes.creativeQuerySubject.categories = [ ...new Set([...queryGraph.nodes.creativeQuerySubject.categories, ...qSubject.categories]), ]; @@ -249,7 +248,7 @@ export default class InferredQueryHandler { delete queryGraph.nodes.creativeQueryObject.ids; } - return { template, queryGraph }; + return { template, queryGraph, qualifiers }; }); return subQueries; @@ -261,7 +260,8 @@ export default class InferredQueryHandler { qEdgeID: string, qEdge: TrapiQEdge, combinedResponse: CombinedResponse, - auxGraphSuffixes: {[inferredEdgeID: string]: number} + auxGraphSuffixes: {[inferredEdgeID: string]: number}, + qualifiers?: CompactQualifiers, ): CombinedResponseReport { const span = Telemetry.startSpan({ description: 'creativeCombineResponse' }); const newResponse = handler.getResponse(); @@ -331,8 +331,43 @@ export default class InferredQueryHandler { const resultID = `${resultCreativeSubjectID}-${resultCreativeObjectID}`; // Direct edge answers stand on their own, not as an inferred edge. - if (Object.keys(result.node_bindings).length == 2) { - const boundEdgeID = Object.values(result.analyses[0].edge_bindings)[0][0].id; + const boundEdgeID = Object.values(result.analyses[0].edge_bindings)[0][0].id; + const boundEdge = combinedResponse.message.knowledge_graph.edges[boundEdgeID]; + const oneHop = Object.keys(result.node_bindings).length === 2; // Direct edge + // Predicate matches or is descendant + const predicateMatch = + qEdge.predicates?.some( + (predicate) => + predicate === boundEdge.predicate || + biolink.getDescendantPredicates(predicate).includes(boundEdge.predicate), + ) ?? false; + // All query qualifiers (if any) are accounted for (more is fine) + const qualifierMatch = + !qEdge.qualifier_constraints || + qEdge.qualifier_constraints.length === 0 || + qEdge.qualifier_constraints?.some(({ qualifier_set }) => { + return qualifier_set.every((queryQualifier) => { + return ( + boundEdge.qualifiers?.some((qualifier) => { + const typeMatch = queryQualifier.qualifier_type_id === qualifier.qualifier_type_id; + let valueMatch: boolean; + try { + const descendants = queryQualifier.qualifier_value.includes('biolink:') + ? biolink.getDescendantPredicates(queryQualifier.qualifier_value as string) + : biolink.getDescendantQualifiers(queryQualifier.qualifier_value as string); + valueMatch = + queryQualifier.qualifier_value === qualifier.qualifier_value || + descendants.includes(qualifier.qualifier_value as string); + } catch (err) { + valueMatch = queryQualifier.qualifier_value === qualifier.qualifier_value; + } + return typeMatch && valueMatch; + }) ?? false + ); + }); + }); + const specialHandling = oneHop && predicateMatch && qualifierMatch; + if (specialHandling) { translatedResult.analyses[0].edge_bindings = { [qEdgeID]: [{ id: boundEdgeID, attributes: [] }] }; } else { // Create an aux graph using the result and associate it with an inferred Edge @@ -356,14 +391,28 @@ export default class InferredQueryHandler { ], attributes: [ { attribute_type_id: 'biolink:support_graphs', value: [] }, - { attribute_type_id: 'biolink:knowledge_level', value: "prediction" }, - { attribute_type_id: 'biolink:agent_type', value: "computational_model" }, + { attribute_type_id: 'biolink:knowledge_level', value: 'prediction' }, + { attribute_type_id: 'biolink:agent_type', value: 'computational_model' }, ], }; } if (!auxGraphSuffixes[inferredEdgeID]) auxGraphSuffixes[inferredEdgeID] = 0; const auxGraphID = `${inferredEdgeID}-support${auxGraphSuffixes[inferredEdgeID]}`; auxGraphSuffixes[inferredEdgeID]++; + // Add qualifiers to edge + if ( + typeof qualifiers == 'object' && + Object.keys(qualifiers).length > 0 && + !combinedResponse.message.knowledge_graph.edges[inferredEdgeID].qualifiers + ) { + combinedResponse.message.knowledge_graph.edges[inferredEdgeID].qualifiers = Object.entries(qualifiers).map( + ([qualifierType, qualifierValue]) => ({ + qualifier_type_id: qualifierType, + qualifier_value: qualifierValue, + }), + ); + } + (combinedResponse.message.knowledge_graph.edges[inferredEdgeID].attributes[0].value as string[]).push( auxGraphID, ); @@ -375,7 +424,7 @@ export default class InferredQueryHandler { }, [] as string[], ), - attributes: [] + attributes: [], }; if (this.pathfinder) { @@ -412,9 +461,9 @@ export default class InferredQueryHandler { if (typeof combinedResponse.message.results[resultID].analyses[0].score !== 'undefined') { combinedResponse.message.results[resultID].analyses[0].score = resScore ? scaled_sigmoid( - inverse_scaled_sigmoid(combinedResponse.message.results[resultID].analyses[0].score) + - inverse_scaled_sigmoid(resScore), - ) + inverse_scaled_sigmoid(combinedResponse.message.results[resultID].analyses[0].score) + + inverse_scaled_sigmoid(resScore), + ) : combinedResponse.message.results[resultID].analyses[0].score; } else { combinedResponse.message.results[resultID].analyses[0].score = resScore; @@ -550,7 +599,7 @@ export default class InferredQueryHandler { } = {}; const auxGraphSuffixes: {[inferredEdgeID: string]: number} = {}; - await async.eachOfSeries(subQueries, async ({ template, queryGraph }, i) => { + await async.eachOfSeries(subQueries, async ({ template, queryGraph, qualifiers }, i) => { const span = Telemetry.startSpan({ description: 'creativeTemplate' }); span.setData('template', (i as number) + 1); i = i as number; @@ -573,7 +622,8 @@ export default class InferredQueryHandler { qEdgeID, qEdge, combinedResponse, - auxGraphSuffixes + auxGraphSuffixes, + qualifiers, ); // update values used in logging successfulQueries += querySuccess; @@ -587,12 +637,10 @@ export default class InferredQueryHandler { stop = true; const message = [ `Addition of ${creativeLimitHit} results from Template ${i + 1}`, - creativeLimitHit === this.CREATIVE_LIMIT ? ' meets ' : ' exceeds ', - `creative result maximum of ${this.CREATIVE_LIMIT} (reaching ${ - creativeLimitHit + Object.keys(combinedResponse.message.results).length === this.CREATIVE_LIMIT ? ' meets ' : ' exceeds ', + `creative result maximum of ${this.CREATIVE_LIMIT} (reaching ${Object.keys(combinedResponse.message.results).length } merged). `, - `Response will be truncated to top-scoring ${this.CREATIVE_LIMIT} results. Skipping remaining ${ - subQueries.length - (i + 1) + `Response will be truncated to top-scoring ${this.CREATIVE_LIMIT} results. Skipping remaining ${subQueries.length - (i + 1) } `, subQueries.length - (i + 1) === 1 ? `template.` : `templates.`, ].join(''); @@ -617,9 +665,8 @@ export default class InferredQueryHandler { const total = Object.values(mergedResultsCount).reduce((sum, count) => sum + count, 0) + Object.keys(mergedResultsCount).length; - const message = `Merging Summary: (${total}) inferred-template results were merged into (${ - Object.keys(mergedResultsCount).length - }) final results, reducing result count by (${total - Object.keys(mergedResultsCount).length})`; + const message = `Merging Summary: (${total}) inferred-template results were merged into (${Object.keys(mergedResultsCount).length + }) final results, reducing result count by (${total - Object.keys(mergedResultsCount).length})`; debug(message); combinedResponse.logs.push(new LogEntry('INFO', null, message).getLog()); } diff --git a/src/inferred_mode/template_lookup.ts b/src/inferred_mode/template_lookup.ts index d5616723..0463fe0e 100644 --- a/src/inferred_mode/template_lookup.ts +++ b/src/inferred_mode/template_lookup.ts @@ -9,14 +9,13 @@ export interface TemplateLookup { subject: string; object: string; predicate: string; - qualifiers: { - [qualifierType: string]: string; - }; + qualifiers: CompactQualifiers; } export interface MatchedTemplate { template: string; queryGraph: TrapiQueryGraph; + qualifiers: CompactQualifiers; } export interface TemplateGroup { @@ -36,6 +35,11 @@ export interface CompactEdge { qualifiers: CompactQualifiers; } +interface PathMatch { + path: string; + qualifiers: CompactQualifiers; +} + export async function getTemplates(lookups: TemplateLookup[], pathfinder = false): Promise { async function getFiles(dir: string): Promise { const rootFiles = await fs.readdir(path.resolve(dir)); @@ -56,32 +60,38 @@ export async function getTemplates(lookups: TemplateLookup[], pathfinder = false const templateGroups = JSON.parse( await fs.readFile(path.resolve(__dirname, '../../data/templateGroups.json'), { encoding: 'utf8' }), ); - const matchingTemplatePaths: string[] = templateGroups.reduce((matches: string[], group: TemplateGroup) => { + const matchingTemplatePaths: PathMatch[] = templateGroups.reduce((matches: PathMatch[], group: TemplateGroup) => { + let matchingQualifers: CompactQualifiers; const lookupMatch = lookups.some((lookup) => { - return ( + const match = (!!group.pathfinder === pathfinder) && group.subject.includes(lookup.subject) && group.object.includes(lookup.object) && group.predicate.includes(lookup.predicate) && Object.entries(lookup.qualifiers || {}).every(([qualifierType, qualifierValue]) => { - return (group.qualifiers || {})[qualifierType] && group.qualifiers[qualifierType] === qualifierValue; - }) - ); + return ( + (group.qualifiers || {})[qualifierType.replace('biolink:', '')] && + group.qualifiers[qualifierType.replace('biolink:', '')] === qualifierValue.replace('biolink:', '') + ); + }); + if (match) matchingQualifers = lookup.qualifiers; + return match; }); if (lookupMatch) { group.templates.forEach((template) => { - if (!matches.includes(templatePaths[template])) { - matches.push(templatePaths[template]); + if (!matches.find((t) => t.path === templatePaths[template])) { + matches.push({ path: templatePaths[template], qualifiers: matchingQualifers }); } }); } return matches; }, [] as string[]); - return await async.map(matchingTemplatePaths, async (templatePath: string) => { + return await async.map(matchingTemplatePaths, async (templatePathObj: PathMatch) => { return { - template: templatePath.substring(templatePath.lastIndexOf('/') + 1), - queryGraph: JSON.parse(await fs.readFile(templatePath, { encoding: 'utf8' })).message.query_graph, + template: templatePathObj.path.substring(templatePathObj.path.lastIndexOf('/') + 1), + queryGraph: JSON.parse(await fs.readFile(templatePathObj.path, { encoding: 'utf8' })).message.query_graph, + qualifiers: templatePathObj.qualifiers, }; }); } diff --git a/src/results_assembly/pfocr.ts b/src/results_assembly/pfocr.ts index 7f68528c..222c220c 100644 --- a/src/results_assembly/pfocr.ts +++ b/src/results_assembly/pfocr.ts @@ -4,12 +4,16 @@ const debug = Debug('bte:biothings-explorer-trapi:pfocr'); import { intersection } from '../utils'; import _ from 'lodash'; import { LogEntry, StampedLog } from '@biothings-explorer/utils'; -import { TrapiResult } from '@biothings-explorer/types'; +import { TrapiResult, TrapiKGNode, TrapiResponse, TrapiKGEdge } from '@biothings-explorer/types'; +import Graph from '../graph/graph'; // the minimum acceptable intersection size between the CURIEs // in a TRAPI result and in a PFOCR figure. const MATCH_COUNT_MIN = 2; const FIGURE_COUNT_MAX = 20; +const SUPPORTED_PREFIXES = { + NCBIGene: 'associatedWith.mentions.genes.ncbigene', +}; interface pfocrQueryBody { q: string[]; @@ -28,6 +32,7 @@ interface FigureResult { notfound?: boolean; associatedWith: { figureUrl: string; + pfocrUrl: string; pmc: string; mentions: { genes: { @@ -59,14 +64,19 @@ async function getAllByScrolling( hits: RawFigureResult[] = [], ): Promise { queryBody.from = batchIndex; - const { data } = await axios.post(baseUrl, queryBody).catch((err) => { - debug('Error in scrolling request', err); - throw err; - }); + let data: { hits: RawFigureResult[]; max_total: number }; + try { + data = (await axios.post(baseUrl, queryBody, { timeout: 15000 })).data; + } catch (err) { + debug(`Error in scrolling request window ${batchIndex}-${batchIndex + 1000}, error is ${(err as Error).message}`); + } + + if (data) { + hits.push(...data.hits); + debug(`Batch window ${batchIndex}-${batchIndex + 1000}: ${data.hits.length} hits retrieved for PFOCR figure data`); + } - hits.push(...data.hits); - debug(`Batch window ${batchIndex}-${batchIndex + 1000}: ${data.hits.length} hits retrieved for PFOCR figure data`); - if (batchIndex + 1000 < data.max_total) { + if (data && batchIndex + 1000 < data.max_total) { return await getAllByScrolling(baseUrl, queryBody, batchIndex + 1000, hits); } else { return hits; @@ -77,7 +87,12 @@ async function getAllByScrolling( */ async function getPfocrFigures(qTerms: Set): Promise { debug(`Getting PFOCR figure data`); - const url = 'https://biothings.ncats.io/pfocr/query'; + const url = { + dev: 'https://biothings.ci.transltr.io/pfocr/query', + ci: 'https://biothings.ci.transltr.io/pfocr/query', + test: 'https://biothings.test.transltr.io/pfocr/query', + prod: 'https://biothings.ncats.io/pfocr/query', + }[process.env.INSTANCE_ENV ?? 'prod']; /* * We can now POST using minimum_should_match to bypass most set logic on our side * detailed here: https://github.com/biothings/pending.api/issues/88 @@ -89,7 +104,13 @@ async function getPfocrFigures(qTerms: Set): Promise): Promise { - const matchableQNodeIDs: Set = new Set(); - - if (allTrapiResults.length === 0) { - return matchableQNodeIDs; - } +function traverseResultForNodes(result: TrapiResult, response: TrapiResponse): Set { + const kg = response.message.knowledge_graph; + const nodes: Set = new Set(); + const edgeStack: TrapiKGEdge[] = []; + Object.values(result.node_bindings).forEach((bindings) => + bindings.forEach((binding) => nodes.add(kg.nodes[binding.id])), + ); + Object.values(result.analyses[0].edge_bindings).forEach((bindings) => + bindings.forEach((binding) => edgeStack.push(kg.edges[binding.id])), + ); - // TODO: this will need to be updated to handle non-NCBIGene CURIEs as well - // as non-gene CURIEs once we support querying for chemicals and diseases. - - const supportedPrefixes = new Set(['NCBIGene']); - for (const trapiResult of allTrapiResults) { - for (const [qNodeID, nodeBindingValues] of Object.entries(trapiResult.node_bindings)) { - for (const nodeBindingValue of nodeBindingValues) { - const prefix = nodeBindingValue.id.split(':')[0]; - if (supportedPrefixes.has(prefix)) { - matchableQNodeIDs.add(qNodeID); - break; - } - } + while (edgeStack.length > 0) { + const edge = edgeStack.pop(); + nodes.add(kg.nodes[edge.object]); + nodes.add(kg.nodes[edge.subject]); + const supportGraphs = edge.attributes.find((attribute) => attribute.attribute_type_id == 'biolink:support_graphs'); + if (supportGraphs) { + (supportGraphs.value as string[]).forEach((auxGraphID) => + response.message.auxiliary_graphs[auxGraphID].edges.forEach((edgeID) => edgeStack.push(kg.edges[edgeID])), + ); } } - debug(`QNode(s) having CURIEs that PFOCR could potentially match: ${[...matchableQNodeIDs]}`); - return matchableQNodeIDs; + return nodes; } /* time complexity: O(t*f) @@ -156,53 +176,66 @@ function getMatchableQNodeIDs(allTrapiResults: TrapiResult[]): Set { * t: trapiResults.length * f: figures.length */ -export async function enrichTrapiResultsWithPfocrFigures(allTrapiResults: TrapiResult[]): Promise { - const matchableQNodeIDs = getMatchableQNodeIDs(allTrapiResults); +export async function enrichTrapiResultsWithPfocrFigures(response: TrapiResponse): Promise { + // NOTE: This function operates on the actual TRAPI information that will be returned + // to the client. Don't mutate what shouldn't be mutated! + const results = response.message.results; const logs: StampedLog[] = []; let resultsWithTruncatedFigures = 0; const truncatedFigures: Set = new Set(); - if (matchableQNodeIDs.size < MATCH_COUNT_MIN) { + const curieCombosByResult: Map = new Map(); + const curiesByResult: Map> = new Map(); + + const curieCombos: Set = results.reduce((combos: Set, result: TrapiResult) => { + const nodes: Set = traverseResultForNodes(result, response); + const combo: Set = new Set(); + let matchedNodes = 0; + [...nodes].forEach((node) => { + let nodeMatched = false; + const equivalentCuries = node.attributes?.find((attribute) => attribute.attribute_type_id === 'biolink:xref') + .value as string[]; + equivalentCuries.forEach((curie) => { + const prefix = curie.split(':')[0]; + const suffix = curie.replace(`${prefix}:`, ''); + if (Object.keys(SUPPORTED_PREFIXES).includes(prefix)) { + combo.add(suffix); + nodeMatched = true; + } + }); + if (nodeMatched) matchedNodes += 1; + }); + if (matchedNodes >= MATCH_COUNT_MIN) { + const comboString = [...combo].join(' '); + curieCombosByResult.set(result, comboString); + combos.add(comboString); + curiesByResult.set(result, combo); + } + return combos; + }, new Set()); + + if (curieCombos.size < 1) { // No TRAPI result can satisfy MATCH_COUNT_MIN logs.push(new LogEntry('DEBUG', null, 'Query does not match criteria, skipping PFOCR figure enrichment.').getLog()); return logs; } - // TODO: currently just NCBIGene CURIEs. Expand to handle any CURIE in PFOCR. - - const trapiResultToCurieSet: Map = new Map(); - - const curieCombinations: Set = new Set( - allTrapiResults.reduce((arr: string[], res) => { - const resultCuries: Set = new Set(); - const matchedQNodes: Set = new Set(); - [...matchableQNodeIDs].forEach((qNodeID) => { - res.node_bindings[qNodeID] - .map((node_binding) => node_binding.id) - .filter((curie) => curie.startsWith('NCBIGene:')) - .forEach((curie) => { - resultCuries.add(curie); - matchedQNodes.add(qNodeID); - }); - }); - - const resultCuriesString = [...resultCuries].map((curie) => curie.replace('NCBIGene:', '')).join(' '); - - if (resultCuries.size >= MATCH_COUNT_MIN && matchedQNodes.size >= MATCH_COUNT_MIN) { - trapiResultToCurieSet.set(res, resultCuriesString); - arr.push(resultCuriesString); - } - - return arr; - }, []), - ); - - const figures = await getPfocrFigures(curieCombinations).catch((err) => { - debug('Error getting PFOCR figures (enrichTrapiResultsWithPfocrFigures)', err); - throw err; - }); + let figures: DeDupedFigureResult[]; + try { + figures = await getPfocrFigures(curieCombos); + } catch (err) { + debug('Error getting PFOCR figures (enrichTrapiResultsWithPfocrFigures)', (err as Error).message); + logs.push( + new LogEntry( + 'WARNING', + null, + `Error getting PFOCR figures, results will not be enriched. The error is ${err.message}`, + ).getLog(), + ); + } + if (!figures) return logs; - debug(`${figures.length} PFOCR figures match at least ${MATCH_COUNT_MIN} genes from any TRAPI result`); + debug(`${figures.length} PFOCR figures match at least ${MATCH_COUNT_MIN} nodes from any TRAPI result`); const figuresByCuries: { [queryCuries: string]: DeDupedFigureResult[] } = {}; figures.forEach((figure) => { @@ -220,24 +253,13 @@ export async function enrichTrapiResultsWithPfocrFigures(allTrapiResults: TrapiR // return set; // }, new Set() as Set); - for (const trapiResult of allTrapiResults) { + for (const trapiResult of results) { // No figures match this result - if (!figuresByCuries[trapiResultToCurieSet.get(trapiResult)]) continue; - - const resultCuries: Set = new Set(); - const resultMatchableQNodeIDs: Set = new Set(); - [...matchableQNodeIDs].forEach((qNodeID) => { - trapiResult.node_bindings[qNodeID] - .map((node_binding) => node_binding.id) - .filter((curie) => curie.startsWith('NCBIGene:')) - .forEach((curie) => { - resultCuries.add(curie.replace('NCBIGene:', '')); - resultMatchableQNodeIDs.add(qNodeID); - }); - }); - if (resultMatchableQNodeIDs.size < 2) continue; + if (!figuresByCuries[curieCombosByResult.get(trapiResult)]) continue; + + const resultCuries = curiesByResult.get(trapiResult); - (figuresByCuries[trapiResultToCurieSet.get(trapiResult)] ?? []).forEach((figure) => { + (figuresByCuries[curieCombosByResult.get(trapiResult)] ?? []).forEach((figure) => { if (!('pfocr' in trapiResult)) { trapiResult.pfocr = []; } @@ -245,20 +267,6 @@ export async function enrichTrapiResultsWithPfocrFigures(allTrapiResults: TrapiR const figureCurieSet = new Set(figure.associatedWith.mentions.genes.ncbigene); const resultGenesInFigure = intersection(resultCuries, figureCurieSet); - const matchedQNodes = [...matchableQNodeIDs].filter((matchableQNodeID) => { - const currentQNodeCurieSet = new Set( - trapiResult.node_bindings[matchableQNodeID].map((node_binding) => node_binding.id), - ); - - return ( - intersection(currentQNodeCurieSet, new Set([...resultGenesInFigure].map((geneID) => `NCBIGene:${geneID}`))) - .size > 0 - ); - }); - - // If we've matched on 2 curies, but we haven't actually matched on multiple nodes - if (matchedQNodes.length < 2) return; - const otherGenesInFigure = figureCurieSet.size - resultGenesInFigure.size; const resultGenesInOtherFigures = [...resultCuries].filter((gene) => { @@ -273,18 +281,12 @@ export async function enrichTrapiResultsWithPfocrFigures(allTrapiResults: TrapiR trapiResult.pfocr.push({ figureUrl: figure.associatedWith.figureUrl, + pfocrUrl: figure.associatedWith.pfocrUrl, pmc: figure.associatedWith.pmc, // TODO: do we want to include figure title? Note: this would need to be added to queryBody. //title: figure.associatedWith.title, matchedCuries: [...resultGenesInFigure].map((geneID) => `NCBIGene:${geneID}`), score: 2 * ((precision * recall) / (precision + recall)), - // 1 - - // parseFloat( - // Analyze([ - // [resultGenesInFigure.size, resultGenesInOtherFigures], - // [otherGenesInFigure, otherGenesInOtherFigures], - // ]).pValue, - // ), }); matchedTrapiResults.add(trapiResult); }); @@ -311,7 +313,7 @@ export async function enrichTrapiResultsWithPfocrFigures(allTrapiResults: TrapiR debug(message); logs.push(new LogEntry('DEBUG', null, message).getLog()); debug( - `${MATCH_COUNT_MIN}+ CURIE matches: ${matchedFigures.size} PFOCR figures and ${matchedTrapiResults.size} TRAPI results`, + `${MATCH_COUNT_MIN}+ node matches: ${matchedFigures.size} PFOCR figures across ${matchedTrapiResults.size} TRAPI results`, ); logs.push( new LogEntry( diff --git a/src/results_assembly/query_results.ts b/src/results_assembly/query_results.ts index 51dff31c..9c444916 100644 --- a/src/results_assembly/query_results.ts +++ b/src/results_assembly/query_results.ts @@ -5,7 +5,6 @@ import { zip } from 'lodash'; const debug = Debug('bte:biothings-explorer-trapi:QueryResult'); import { getScores, calculateScore, ScoreCombos } from './score'; import { Record } from '@biothings-explorer/api-response-transform'; -import { enrichTrapiResultsWithPfocrFigures } from './pfocr'; import * as config from '../config'; export interface RecordsByQEdgeID { @@ -477,13 +476,13 @@ export default class TrapiResultsAssembler { .sort((result1, result2) => (result2.analyses[0].score ?? 0) - (result1.analyses[0].score ?? 0)); //sort by decreasing score if (shouldScore) { - try { - const pfocrEnrichmentLogs = await enrichTrapiResultsWithPfocrFigures(this._results); - this.logs.push(...pfocrEnrichmentLogs); - } catch (err) { - debug('Error enriching with PFOCR figures: ', err); - this.logs.push(new LogEntry('DEBUG', null, 'Error enriching with PFOCR figures: ', err).getLog()); - } + // try { + // const pfocrEnrichmentLogs = await enrichTrapiResultsWithPfocrFigures(this._results); + // this.logs.push(...pfocrEnrichmentLogs); + // } catch (err) { + // debug('Error enriching with PFOCR figures: ', err); + // this.logs.push(new LogEntry('DEBUG', null, 'Error enriching with PFOCR figures: ', err).getLog()); + // } debug(`Scored ${resultsWithScore} results with NGD score, scored ${resultsWithoutScore} results without NGD.`); this.logs.push( new LogEntry( @@ -503,7 +502,7 @@ export default class TrapiResultsAssembler { new LogEntry( 'DEBUG', null, - `Scoring/PFOCR figures disabled for KP endpoints; results not scored. Use ARA endpoints (/v1/query or /v1/asyncquery) for scoring/PFOCR figures.`, + `Scoring disabled for KP endpoints; results not scored. Use ARA endpoints (/v1/query or /v1/asyncquery) for scoring.`, { type: 'scoring', scored: resultsWithScore, diff --git a/src/results_assembly/score.ts b/src/results_assembly/score.ts index 5b55c4c6..d23004c6 100644 --- a/src/results_assembly/score.ts +++ b/src/results_assembly/score.ts @@ -7,8 +7,9 @@ import async from 'async'; import _ from 'lodash'; import { ConsolidatedSolutionRecord, RecordsByQEdgeID } from './query_results'; import { Telemetry } from '@biothings-explorer/utils'; +import { AxiosError } from 'axios'; -const tuning_param = 2.0; +const tuning_param = 1.8; const record_weight = 1.0; const text_mined_record_weight = 0.5; @@ -26,6 +27,8 @@ export interface ScoreCombos { // create lookup table for ngd scores in the format: {inputUMLS-outputUMLS: ngd} async function query(queryPairs: string[][]): Promise { + const NGD_TIMEOUT = process.env.NGD_TIMEOUT_MS ? parseInt(process.env.NGD_TIMEOUT_MS) : 10 * 1000; + const url = { dev: 'https://biothings.ci.transltr.io/semmeddb/query/ngd', ci: 'https://biothings.ci.transltr.io/semmeddb/query/ngd', @@ -33,13 +36,23 @@ async function query(queryPairs: string[][]): Promise { prod: 'https://biothings.ncats.io/semmeddb/query/ngd', }[process.env.INSTANCE_ENV ?? 'prod']; const batchSize = 250; - const concurrency_limit = os.cpus().length * 2; + const concurrency_limit = 100; // server handles ~100 requests per second debug('Querying', queryPairs.length, 'combos.'); const chunked_input = _.chunk(queryPairs, batchSize); + const start = Date.now(); + + let successCount = 0; + let successPairCount = 0; + let errCount = 0; + let errPairCount = 0; + try { const response = await async.mapLimit(chunked_input, concurrency_limit, async (input) => { + const timeRemaining = NGD_TIMEOUT - (Date.now() - start); + if (timeRemaining <= 0) return; + const span = Telemetry.startSpan({ description: 'NGDScoreRequest' }); const data = { umls: input, @@ -48,20 +61,33 @@ async function query(queryPairs: string[][]): Promise { span.setData('requestBody', data); try { // const start = performance.now(); - const response = await axios.post(url, data); + const response = await axios.post(url, data, { timeout: timeRemaining }); // const end = performance.now(); span.finish(); + successCount++; + successPairCount += input.length; return response; } catch (err) { - debug(`NGD score query failed: ${err}`); + const timeoutError = err instanceof AxiosError && err.code === AxiosError.ECONNABORTED; + if (!timeoutError) { + errCount++; + errPairCount += input.length; + debug(`NGD score query failed: ${err}`); + } span.finish(); } }); //convert res array into single object with all curies - const result = response - .map((r): ngdScoreCombo[] => r.data.filter((combo: ngdScoreCombo) => Number.isFinite(combo.ngd))) - .flat(); // get numerical scores and flatten array - return result.reduce((acc, cur) => ({ ...acc, [`${cur.umls[0]}-${cur.umls[1]}`]: cur.ngd }), {}); + const result = {}; + for (const res of response) { + if (res == undefined) continue; + for (const combo of res.data) { + if (!Number.isFinite(combo.ngd)) continue; + result[`${combo.umls[0]}-${combo.umls[1]}`] = combo.ngd; + } + } + debug(`${successCount} / ${errCount} / ${chunked_input.length - successCount - errCount} queries successful / errored / timed out, representing ${successPairCount} / ${errPairCount} / ${queryPairs.length - successPairCount - errPairCount} pairs`); + return result; } catch (err) { debug('Failed to query for scores: ', err); } @@ -69,38 +95,52 @@ async function query(queryPairs: string[][]): Promise { // retrieve all ngd scores at once export async function getScores(recordsByQEdgeID: RecordsByQEdgeID): Promise { - const pairs: { [input_umls: string]: Set } = {}; + const pairSet = new Set(); + // organize pairs in layers + // first from each record is first layer, second from each record is second layer, etc. + // this makes it so more records are covered in earlier layers + const organizedPairs: string[][][] = []; + // this stores the "layer" number for each recordHash + const pairCounts: { [hash: string]: number } = {}; let combosWithoutIDs = 0; - Object.values(recordsByQEdgeID).forEach(({ records }) => { - records.forEach((record) => { + for (const { records } of Object.values(recordsByQEdgeID)) { + for (const record of records) { const inputUMLS = record.subject.UMLS || []; const outputUMLS = record.object.UMLS || []; - - inputUMLS?.forEach((input_umls) => { - if (!(input_umls in pairs)) { - pairs[input_umls] = new Set(); - } - outputUMLS?.forEach((output_umls) => { - pairs[input_umls].add(output_umls); - }); - }); + const hash = record.recordHash; if (inputUMLS.length == 0 || outputUMLS.length == 0) { // debug("NO RESULT", record.subject.curie, record.subject.UMLS, record.object.curie, record.object.UMLS) combosWithoutIDs++; + continue; } - }); - }); - const queries = Object.keys(pairs) - .map((inputUMLS) => { - return [...pairs[inputUMLS]].map((outputUMLS) => [inputUMLS, outputUMLS]); - }) - .flat(); + for (const input_umls of inputUMLS) { + for (const output_umls of outputUMLS) { + const pairStr = `${input_umls}\n${output_umls}`; + if (pairSet.has(pairStr)) continue; + pairSet.add(pairStr); + if (pairCounts[hash] == undefined) pairCounts[hash] = 0; + if (organizedPairs.length <= pairCounts[hash]) organizedPairs.push([]); + organizedPairs[pairCounts[hash]].push([input_umls, output_umls]); + pairCounts[hash]++; + } + } + } + } + + const flatPairs = Array(pairSet.size).fill([]); + let i = 0; + for (const pairGroup of organizedPairs) { + for (const pair of pairGroup) { + flatPairs[i] = pair; + i++; + } + } - const results = await query(queries); + const results = await query(flatPairs); debug('Combos no UMLS ID: ', combosWithoutIDs); return results || {}; // in case results is undefined, avoid TypeErrors @@ -109,12 +149,11 @@ export async function getScores(recordsByQEdgeID: RecordsByQEdgeID): Promise