From 5a55739896cd469338ac9cd2fcfbddee6f0f5a97 Mon Sep 17 00:00:00 2001 From: tokebe <43009413+tokebe@users.noreply.github.com> Date: Wed, 31 Jul 2024 12:08:22 -0400 Subject: [PATCH 1/3] feat: PFOCR enrichment takes support graphs into account --- src/index.ts | 4 + src/results_assembly/pfocr.ts | 150 +++++++++++--------------- src/results_assembly/query_results.ts | 17 ++- 3 files changed, 74 insertions(+), 97 deletions(-) diff --git a/src/index.ts b/src/index.ts index 59401b36..95751de3 100644 --- a/src/index.ts +++ b/src/index.ts @@ -29,6 +29,7 @@ import { QueryHandlerOptions } from '@biothings-explorer/types'; import BTEGraph from './graph/graph'; import QEdge from './query_edge'; import { Telemetry } from '@biothings-explorer/utils'; +import { enrichTrapiResultsWithPfocrFigures } from './results_assembly/pfocr'; // Exports for external availability export * from './types'; @@ -703,6 +704,9 @@ export default class TRAPIQueryHandler { this.bteGraph.prune(this.finalizedResults, this.auxGraphs); this.bteGraph.notify(); + // Attempt to enrich results with PFOCR figures + this.logs = [...this.logs, ...(await enrichTrapiResultsWithPfocrFigures(this.getResponse()))]; + span3?.finish(); // check primary knowledge sources diff --git a/src/results_assembly/pfocr.ts b/src/results_assembly/pfocr.ts index 7f68528c..557badff 100644 --- a/src/results_assembly/pfocr.ts +++ b/src/results_assembly/pfocr.ts @@ -4,12 +4,16 @@ const debug = Debug('bte:biothings-explorer-trapi:pfocr'); import { intersection } from '../utils'; import _ from 'lodash'; import { LogEntry, StampedLog } from '@biothings-explorer/utils'; -import { TrapiResult } from '@biothings-explorer/types'; +import { TrapiResult, TrapiKGNode, TrapiResponse, TrapiKGEdge } from '@biothings-explorer/types'; +import Graph from '../graph/graph'; // the minimum acceptable intersection size between the CURIEs // in a TRAPI result and in a PFOCR figure. const MATCH_COUNT_MIN = 2; const FIGURE_COUNT_MAX = 20; +const SUPPORTED_PREFIXES = { + NCBIGene: 'associatedWith.mentions.genes.ncbigene', +}; interface pfocrQueryBody { q: string[]; @@ -124,31 +128,32 @@ async function getPfocrFigures(qTerms: Set): Promise { - const matchableQNodeIDs: Set = new Set(); - - if (allTrapiResults.length === 0) { - return matchableQNodeIDs; - } - - // TODO: this will need to be updated to handle non-NCBIGene CURIEs as well - // as non-gene CURIEs once we support querying for chemicals and diseases. +function traverseResultForNodes(result: TrapiResult, response: TrapiResponse): Set { + const kg = response.message.knowledge_graph; + const nodes: Set = new Set(); + const edgeStack: TrapiKGEdge[] = []; + // TODO: get all nodes from a result, whether it be recursive or iterative. + // First get all bound nodes and edges + Object.values(result.node_bindings).forEach((bindings) => + bindings.forEach((binding) => nodes.add(kg.nodes[binding.id])), + ); + Object.values(result.analyses[0].edge_bindings).forEach((bindings) => + bindings.forEach((binding) => edgeStack.push(kg.edges[binding.id])), + ); - const supportedPrefixes = new Set(['NCBIGene']); - for (const trapiResult of allTrapiResults) { - for (const [qNodeID, nodeBindingValues] of Object.entries(trapiResult.node_bindings)) { - for (const nodeBindingValue of nodeBindingValues) { - const prefix = nodeBindingValue.id.split(':')[0]; - if (supportedPrefixes.has(prefix)) { - matchableQNodeIDs.add(qNodeID); - break; - } - } + while (edgeStack.length > 0) { + const edge = edgeStack.pop(); + nodes.add(kg.nodes[edge.object]); + nodes.add(kg.nodes[edge.subject]); + const supportGraphs = edge.attributes.find((attribute) => attribute.attribute_type_id == 'biolink:support_graphs'); + if (supportGraphs) { + (supportGraphs.value as string[]).forEach((auxGraphID) => + response.message.auxiliary_graphs[auxGraphID].edges.forEach((edgeID) => edgeStack.push(kg.edges[edgeID])), + ); } } - debug(`QNode(s) having CURIEs that PFOCR could potentially match: ${[...matchableQNodeIDs]}`); - return matchableQNodeIDs; + return nodes; } /* time complexity: O(t*f) @@ -156,53 +161,54 @@ function getMatchableQNodeIDs(allTrapiResults: TrapiResult[]): Set { * t: trapiResults.length * f: figures.length */ -export async function enrichTrapiResultsWithPfocrFigures(allTrapiResults: TrapiResult[]): Promise { - const matchableQNodeIDs = getMatchableQNodeIDs(allTrapiResults); +export async function enrichTrapiResultsWithPfocrFigures(response: TrapiResponse): Promise { + // NOTE: This function operates on the actual TRAPI information that will be returned + // to the client. Don't mutate what shouldn't be mutated! + const supportedPrefixes = new Set(['NCBIGene']); + const results = response.message.results; const logs: StampedLog[] = []; let resultsWithTruncatedFigures = 0; const truncatedFigures: Set = new Set(); - if (matchableQNodeIDs.size < MATCH_COUNT_MIN) { + const curieCombosByResult: Map = new Map(); + const curieCombos: Set = results.reduce((combos: Set, result: TrapiResult) => { + const nodes: Set = traverseResultForNodes(result, response); + const combo: Set = new Set(); + let matchedNodes = 0; + Object.entries(nodes).forEach(([primaryCurie, node]) => { + let nodeMatched = false; + const equivalentCuries = node.attributes?.find((attribute) => attribute.attribute_type_id === 'biolink:xref') + .value as string[]; + [primaryCurie, ...equivalentCuries].forEach((curie) => { + if (supportedPrefixes.has(curie.split(':')[0])) { + combo.add(curie.split(':')[1]); + nodeMatched = true; + } + }); + if (nodeMatched) matchedNodes += 1; + }); + if (matchedNodes >= MATCH_COUNT_MIN) { + const comboString = [...combo].join(' '); + curieCombosByResult.set(result, comboString); + combos.add(comboString); + } + return combos; + }, new Set()); + + if (curieCombos.size < 1) { // No TRAPI result can satisfy MATCH_COUNT_MIN logs.push(new LogEntry('DEBUG', null, 'Query does not match criteria, skipping PFOCR figure enrichment.').getLog()); return logs; } - // TODO: currently just NCBIGene CURIEs. Expand to handle any CURIE in PFOCR. - const trapiResultToCurieSet: Map = new Map(); - const curieCombinations: Set = new Set( - allTrapiResults.reduce((arr: string[], res) => { - const resultCuries: Set = new Set(); - const matchedQNodes: Set = new Set(); - [...matchableQNodeIDs].forEach((qNodeID) => { - res.node_bindings[qNodeID] - .map((node_binding) => node_binding.id) - .filter((curie) => curie.startsWith('NCBIGene:')) - .forEach((curie) => { - resultCuries.add(curie); - matchedQNodes.add(qNodeID); - }); - }); - - const resultCuriesString = [...resultCuries].map((curie) => curie.replace('NCBIGene:', '')).join(' '); - - if (resultCuries.size >= MATCH_COUNT_MIN && matchedQNodes.size >= MATCH_COUNT_MIN) { - trapiResultToCurieSet.set(res, resultCuriesString); - arr.push(resultCuriesString); - } - - return arr; - }, []), - ); - - const figures = await getPfocrFigures(curieCombinations).catch((err) => { + const figures = await getPfocrFigures(curieCombos).catch((err) => { debug('Error getting PFOCR figures (enrichTrapiResultsWithPfocrFigures)', err); throw err; }); - debug(`${figures.length} PFOCR figures match at least ${MATCH_COUNT_MIN} genes from any TRAPI result`); + debug(`${figures.length} PFOCR figures match at least ${MATCH_COUNT_MIN} nodes from any TRAPI result`); const figuresByCuries: { [queryCuries: string]: DeDupedFigureResult[] } = {}; figures.forEach((figure) => { @@ -220,22 +226,11 @@ export async function enrichTrapiResultsWithPfocrFigures(allTrapiResults: TrapiR // return set; // }, new Set() as Set); - for (const trapiResult of allTrapiResults) { + for (const trapiResult of results) { // No figures match this result if (!figuresByCuries[trapiResultToCurieSet.get(trapiResult)]) continue; const resultCuries: Set = new Set(); - const resultMatchableQNodeIDs: Set = new Set(); - [...matchableQNodeIDs].forEach((qNodeID) => { - trapiResult.node_bindings[qNodeID] - .map((node_binding) => node_binding.id) - .filter((curie) => curie.startsWith('NCBIGene:')) - .forEach((curie) => { - resultCuries.add(curie.replace('NCBIGene:', '')); - resultMatchableQNodeIDs.add(qNodeID); - }); - }); - if (resultMatchableQNodeIDs.size < 2) continue; (figuresByCuries[trapiResultToCurieSet.get(trapiResult)] ?? []).forEach((figure) => { if (!('pfocr' in trapiResult)) { @@ -245,20 +240,6 @@ export async function enrichTrapiResultsWithPfocrFigures(allTrapiResults: TrapiR const figureCurieSet = new Set(figure.associatedWith.mentions.genes.ncbigene); const resultGenesInFigure = intersection(resultCuries, figureCurieSet); - const matchedQNodes = [...matchableQNodeIDs].filter((matchableQNodeID) => { - const currentQNodeCurieSet = new Set( - trapiResult.node_bindings[matchableQNodeID].map((node_binding) => node_binding.id), - ); - - return ( - intersection(currentQNodeCurieSet, new Set([...resultGenesInFigure].map((geneID) => `NCBIGene:${geneID}`))) - .size > 0 - ); - }); - - // If we've matched on 2 curies, but we haven't actually matched on multiple nodes - if (matchedQNodes.length < 2) return; - const otherGenesInFigure = figureCurieSet.size - resultGenesInFigure.size; const resultGenesInOtherFigures = [...resultCuries].filter((gene) => { @@ -278,13 +259,6 @@ export async function enrichTrapiResultsWithPfocrFigures(allTrapiResults: TrapiR //title: figure.associatedWith.title, matchedCuries: [...resultGenesInFigure].map((geneID) => `NCBIGene:${geneID}`), score: 2 * ((precision * recall) / (precision + recall)), - // 1 - - // parseFloat( - // Analyze([ - // [resultGenesInFigure.size, resultGenesInOtherFigures], - // [otherGenesInFigure, otherGenesInOtherFigures], - // ]).pValue, - // ), }); matchedTrapiResults.add(trapiResult); }); @@ -311,7 +285,7 @@ export async function enrichTrapiResultsWithPfocrFigures(allTrapiResults: TrapiR debug(message); logs.push(new LogEntry('DEBUG', null, message).getLog()); debug( - `${MATCH_COUNT_MIN}+ CURIE matches: ${matchedFigures.size} PFOCR figures and ${matchedTrapiResults.size} TRAPI results`, + `${MATCH_COUNT_MIN}+ node matches: ${matchedFigures.size} PFOCR figures across ${matchedTrapiResults.size} TRAPI results`, ); logs.push( new LogEntry( diff --git a/src/results_assembly/query_results.ts b/src/results_assembly/query_results.ts index 51dff31c..9c444916 100644 --- a/src/results_assembly/query_results.ts +++ b/src/results_assembly/query_results.ts @@ -5,7 +5,6 @@ import { zip } from 'lodash'; const debug = Debug('bte:biothings-explorer-trapi:QueryResult'); import { getScores, calculateScore, ScoreCombos } from './score'; import { Record } from '@biothings-explorer/api-response-transform'; -import { enrichTrapiResultsWithPfocrFigures } from './pfocr'; import * as config from '../config'; export interface RecordsByQEdgeID { @@ -477,13 +476,13 @@ export default class TrapiResultsAssembler { .sort((result1, result2) => (result2.analyses[0].score ?? 0) - (result1.analyses[0].score ?? 0)); //sort by decreasing score if (shouldScore) { - try { - const pfocrEnrichmentLogs = await enrichTrapiResultsWithPfocrFigures(this._results); - this.logs.push(...pfocrEnrichmentLogs); - } catch (err) { - debug('Error enriching with PFOCR figures: ', err); - this.logs.push(new LogEntry('DEBUG', null, 'Error enriching with PFOCR figures: ', err).getLog()); - } + // try { + // const pfocrEnrichmentLogs = await enrichTrapiResultsWithPfocrFigures(this._results); + // this.logs.push(...pfocrEnrichmentLogs); + // } catch (err) { + // debug('Error enriching with PFOCR figures: ', err); + // this.logs.push(new LogEntry('DEBUG', null, 'Error enriching with PFOCR figures: ', err).getLog()); + // } debug(`Scored ${resultsWithScore} results with NGD score, scored ${resultsWithoutScore} results without NGD.`); this.logs.push( new LogEntry( @@ -503,7 +502,7 @@ export default class TrapiResultsAssembler { new LogEntry( 'DEBUG', null, - `Scoring/PFOCR figures disabled for KP endpoints; results not scored. Use ARA endpoints (/v1/query or /v1/asyncquery) for scoring/PFOCR figures.`, + `Scoring disabled for KP endpoints; results not scored. Use ARA endpoints (/v1/query or /v1/asyncquery) for scoring.`, { type: 'scoring', scored: resultsWithScore, From 7145a9e472d10c750aa981822d57b1905e2a615d Mon Sep 17 00:00:00 2001 From: tokebe <43009413+tokebe@users.noreply.github.com> Date: Thu, 1 Aug 2024 14:40:36 -0400 Subject: [PATCH 2/3] fix: node handling, curieCombosByResults --- src/results_assembly/pfocr.ts | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/results_assembly/pfocr.ts b/src/results_assembly/pfocr.ts index 557badff..5d752d04 100644 --- a/src/results_assembly/pfocr.ts +++ b/src/results_assembly/pfocr.ts @@ -132,8 +132,6 @@ function traverseResultForNodes(result: TrapiResult, response: TrapiResponse): S const kg = response.message.knowledge_graph; const nodes: Set = new Set(); const edgeStack: TrapiKGEdge[] = []; - // TODO: get all nodes from a result, whether it be recursive or iterative. - // First get all bound nodes and edges Object.values(result.node_bindings).forEach((bindings) => bindings.forEach((binding) => nodes.add(kg.nodes[binding.id])), ); @@ -164,7 +162,6 @@ function traverseResultForNodes(result: TrapiResult, response: TrapiResponse): S export async function enrichTrapiResultsWithPfocrFigures(response: TrapiResponse): Promise { // NOTE: This function operates on the actual TRAPI information that will be returned // to the client. Don't mutate what shouldn't be mutated! - const supportedPrefixes = new Set(['NCBIGene']); const results = response.message.results; const logs: StampedLog[] = []; let resultsWithTruncatedFigures = 0; @@ -175,12 +172,12 @@ export async function enrichTrapiResultsWithPfocrFigures(response: TrapiResponse const nodes: Set = traverseResultForNodes(result, response); const combo: Set = new Set(); let matchedNodes = 0; - Object.entries(nodes).forEach(([primaryCurie, node]) => { + [...nodes].forEach((node) => { let nodeMatched = false; const equivalentCuries = node.attributes?.find((attribute) => attribute.attribute_type_id === 'biolink:xref') .value as string[]; - [primaryCurie, ...equivalentCuries].forEach((curie) => { - if (supportedPrefixes.has(curie.split(':')[0])) { + equivalentCuries.forEach((curie) => { + if (Object.keys(SUPPORTED_PREFIXES).includes(curie.split(':')[0])) { combo.add(curie.split(':')[1]); nodeMatched = true; } @@ -201,8 +198,6 @@ export async function enrichTrapiResultsWithPfocrFigures(response: TrapiResponse return logs; } - const trapiResultToCurieSet: Map = new Map(); - const figures = await getPfocrFigures(curieCombos).catch((err) => { debug('Error getting PFOCR figures (enrichTrapiResultsWithPfocrFigures)', err); throw err; @@ -228,11 +223,11 @@ export async function enrichTrapiResultsWithPfocrFigures(response: TrapiResponse for (const trapiResult of results) { // No figures match this result - if (!figuresByCuries[trapiResultToCurieSet.get(trapiResult)]) continue; + if (!figuresByCuries[curieCombosByResult.get(trapiResult)]) continue; const resultCuries: Set = new Set(); - (figuresByCuries[trapiResultToCurieSet.get(trapiResult)] ?? []).forEach((figure) => { + (figuresByCuries[curieCombosByResult.get(trapiResult)] ?? []).forEach((figure) => { if (!('pfocr' in trapiResult)) { trapiResult.pfocr = []; } From fedd52617d1f29e386aa445f0293d627dce933d2 Mon Sep 17 00:00:00 2001 From: tokebe <43009413+tokebe@users.noreply.github.com> Date: Thu, 1 Aug 2024 15:15:05 -0400 Subject: [PATCH 3/3] fix: add pfocrUrl, fix curie handling for score --- src/results_assembly/pfocr.ts | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/results_assembly/pfocr.ts b/src/results_assembly/pfocr.ts index 5d752d04..c122adb6 100644 --- a/src/results_assembly/pfocr.ts +++ b/src/results_assembly/pfocr.ts @@ -32,6 +32,7 @@ interface FigureResult { notfound?: boolean; associatedWith: { figureUrl: string; + pfocrUrl: string; pmc: string; mentions: { genes: { @@ -93,7 +94,13 @@ async function getPfocrFigures(qTerms: Set): Promise = new Set(); const curieCombosByResult: Map = new Map(); + const curiesByResult: Map> = new Map(); + const curieCombos: Set = results.reduce((combos: Set, result: TrapiResult) => { const nodes: Set = traverseResultForNodes(result, response); const combo: Set = new Set(); @@ -177,8 +186,10 @@ export async function enrichTrapiResultsWithPfocrFigures(response: TrapiResponse const equivalentCuries = node.attributes?.find((attribute) => attribute.attribute_type_id === 'biolink:xref') .value as string[]; equivalentCuries.forEach((curie) => { - if (Object.keys(SUPPORTED_PREFIXES).includes(curie.split(':')[0])) { - combo.add(curie.split(':')[1]); + const prefix = curie.split(':')[0]; + const suffix = curie.replace(`${prefix}:`, ''); + if (Object.keys(SUPPORTED_PREFIXES).includes(prefix)) { + combo.add(suffix); nodeMatched = true; } }); @@ -188,6 +199,7 @@ export async function enrichTrapiResultsWithPfocrFigures(response: TrapiResponse const comboString = [...combo].join(' '); curieCombosByResult.set(result, comboString); combos.add(comboString); + curiesByResult.set(result, combo); } return combos; }, new Set()); @@ -225,7 +237,7 @@ export async function enrichTrapiResultsWithPfocrFigures(response: TrapiResponse // No figures match this result if (!figuresByCuries[curieCombosByResult.get(trapiResult)]) continue; - const resultCuries: Set = new Set(); + const resultCuries = curiesByResult.get(trapiResult); (figuresByCuries[curieCombosByResult.get(trapiResult)] ?? []).forEach((figure) => { if (!('pfocr' in trapiResult)) { @@ -249,6 +261,7 @@ export async function enrichTrapiResultsWithPfocrFigures(response: TrapiResponse trapiResult.pfocr.push({ figureUrl: figure.associatedWith.figureUrl, + pfocrUrl: figure.associatedWith.pfocrUrl, pmc: figure.associatedWith.pmc, // TODO: do we want to include figure title? Note: this would need to be added to queryBody. //title: figure.associatedWith.title,