From 7dd4c909b04fb1569498d135b8d22f82a441459a Mon Sep 17 00:00:00 2001 From: rjawesome Date: Tue, 1 Oct 2024 15:57:17 -0700 Subject: [PATCH 1/9] properly merge semmedb sentences --- src/graph/kg_edge.ts | 3 ++- src/graph/knowledge_graph.ts | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/graph/kg_edge.ts b/src/graph/kg_edge.ts index 797c82f1..baee66ea 100644 --- a/src/graph/kg_edge.ts +++ b/src/graph/kg_edge.ts @@ -120,7 +120,8 @@ export default class KGEdge { addAdditionalAttributes(name: string, value: string | string[] | TrapiAttribute[]): void { // special handling for full edge attributes if (name === 'edge-attributes') { - this.attributes[name] = value as TrapiAttribute[]; + if (this.attributes[name]) this.attributes[name] = [...this.attributes[name], ...value as TrapiAttribute[]]; + else this.attributes[name] = value as TrapiAttribute[]; return; } diff --git a/src/graph/knowledge_graph.ts b/src/graph/knowledge_graph.ts index 1a28e9ca..339bc11d 100644 --- a/src/graph/knowledge_graph.ts +++ b/src/graph/knowledge_graph.ts @@ -167,7 +167,14 @@ export default class KnowledgeGraph { }); //handle TRAPI APIs (Situation A of https://github.com/biothings/BioThings_Explorer_TRAPI/issues/208) and APIs that define 'edge-atributes' in x-bte + const seenPmids = new Set(); kgEdge.attributes['edge-attributes']?.forEach((attribute) => { + // Merge SemmedDB sentences + if (attribute.attribute_type_id === "biolink:has_supporting_study_result" && attribute?.attributes?.find((attr) => attr.attribute_type_id === "biolink:publications")) { + const publication = attribute.attributes.find((attr) => attr.attribute_type_id === "biolink:publications").value; + if (seenPmids.has(publication) || seenPmids.size > 50) return; // do not include duplicate publications + seenPmids.add(publication); + } attributes.push(attribute); }); return attributes; From b27a52dc325cdd66176ed41ab0555d34bbf258b4 Mon Sep 17 00:00:00 2001 From: rjawesome Date: Fri, 4 Oct 2024 15:20:50 -0700 Subject: [PATCH 2/9] fix off by one error with pmids --- src/graph/knowledge_graph.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/graph/knowledge_graph.ts b/src/graph/knowledge_graph.ts index 339bc11d..cf97c704 100644 --- a/src/graph/knowledge_graph.ts +++ b/src/graph/knowledge_graph.ts @@ -172,7 +172,7 @@ export default class KnowledgeGraph { // Merge SemmedDB sentences if (attribute.attribute_type_id === "biolink:has_supporting_study_result" && attribute?.attributes?.find((attr) => attr.attribute_type_id === "biolink:publications")) { const publication = attribute.attributes.find((attr) => attr.attribute_type_id === "biolink:publications").value; - if (seenPmids.has(publication) || seenPmids.size > 50) return; // do not include duplicate publications + if (seenPmids.has(publication) || seenPmids.size >= 50) return; // do not include duplicate publications seenPmids.add(publication); } attributes.push(attribute); From b516a74cc95c1d3fd5e2f1eb52d11b1488b3b85f Mon Sep 17 00:00:00 2001 From: rjawesome Date: Fri, 4 Oct 2024 15:27:03 -0700 Subject: [PATCH 3/9] add evidence count attribute based on semmed sentences --- src/graph/knowledge_graph.ts | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/graph/knowledge_graph.ts b/src/graph/knowledge_graph.ts index cf97c704..2418ceda 100644 --- a/src/graph/knowledge_graph.ts +++ b/src/graph/knowledge_graph.ts @@ -177,6 +177,19 @@ export default class KnowledgeGraph { } attributes.push(attribute); }); + + if (seenPmids.size != 0) { + const evidenceAttr = attributes.find(attr => attr.attribute_type_id === 'biolink:evidence_count'); + if (evidenceAttr) { + evidenceAttr.value = seenPmids.size; + } else { + attributes.push({ + attribute_type_id: 'biolink:evidence_count', + value: seenPmids.size, + }); + } + } + return attributes; } From 1d1f94c260a1755cdc042f2fc7c2783c26f46a14 Mon Sep 17 00:00:00 2001 From: rjawesome Date: Fri, 4 Oct 2024 15:32:17 -0700 Subject: [PATCH 4/9] better comments --- src/graph/knowledge_graph.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/graph/knowledge_graph.ts b/src/graph/knowledge_graph.ts index 2418ceda..4f5f66b9 100644 --- a/src/graph/knowledge_graph.ts +++ b/src/graph/knowledge_graph.ts @@ -169,15 +169,17 @@ export default class KnowledgeGraph { //handle TRAPI APIs (Situation A of https://github.com/biothings/BioThings_Explorer_TRAPI/issues/208) and APIs that define 'edge-atributes' in x-bte const seenPmids = new Set(); kgEdge.attributes['edge-attributes']?.forEach((attribute) => { - // Merge SemmedDB sentences + // Do not add multiple SemmedDB sentences/other "supporting study results" from the same publication if (attribute.attribute_type_id === "biolink:has_supporting_study_result" && attribute?.attributes?.find((attr) => attr.attribute_type_id === "biolink:publications")) { const publication = attribute.attributes.find((attr) => attr.attribute_type_id === "biolink:publications").value; - if (seenPmids.has(publication) || seenPmids.size >= 50) return; // do not include duplicate publications + if (seenPmids.has(publication) || seenPmids.size >= 50) return; // publication has been seen or cap reached seenPmids.add(publication); } + attributes.push(attribute); }); + // update evidence count after PMIDs have been merged (for SemmedDB) if (seenPmids.size != 0) { const evidenceAttr = attributes.find(attr => attr.attribute_type_id === 'biolink:evidence_count'); if (evidenceAttr) { From 3f6ac20f392aebe0f524cae20a485d1a98e8e2c6 Mon Sep 17 00:00:00 2001 From: rjawesome Date: Fri, 4 Oct 2024 16:29:16 -0700 Subject: [PATCH 5/9] store edge attributes as arrays, convert to set later if needed --- src/graph/kg_edge.ts | 8 +++----- src/graph/knowledge_graph.ts | 18 +++++++++++++----- src/index.ts | 2 +- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/graph/kg_edge.ts b/src/graph/kg_edge.ts index 797c82f1..14ec7c40 100644 --- a/src/graph/kg_edge.ts +++ b/src/graph/kg_edge.ts @@ -29,7 +29,7 @@ export default class KGEdge { [qualifier_type_id: string]: string | string[]; }; attributes: { - [attribute_type_id: string]: Set | TrapiAttribute[]; + [attribute_type_id: string]: string[] | TrapiAttribute[]; 'edge-attributes'?: TrapiAttribute[]; }; constructor(id: string, info: KGEdgeInfo) { @@ -125,13 +125,11 @@ export default class KGEdge { } if (!(name in this.attributes)) { - this.attributes[name] = new Set(); + this.attributes[name] = []; } if (!Array.isArray(value)) { value = [value]; } - (value as string[]).map((item) => { - (this.attributes[name] as Set).add(item); - }); + (this.attributes[name] as string[]).push(...(value as string[])); } } diff --git a/src/graph/knowledge_graph.ts b/src/graph/knowledge_graph.ts index 9aa4b2da..c5f2b870 100644 --- a/src/graph/knowledge_graph.ts +++ b/src/graph/knowledge_graph.ts @@ -17,7 +17,8 @@ import { toArray, Telemetry } from '@biothings-explorer/utils'; const debug = Debug('bte:biothings-explorer-trapi:KnowledgeGraph'); -const NON_ARRAY_ATTRIBUTES = ['biolink:knowledge_level', 'biolink:agent_type', 'biolink:evidence_count']; +const NON_ARRAY_ATTRIBUTES = ['biolink:knowledge_level', 'biolink:agent_type']; +const SUM_ATTRIBUTES = ['biolink:evidence_count']; interface SpecialAttributeHandlers { [attribute_type_id: string]: (value: Set, kgEdge: KGEdge) => TrapiAttribute['value']; @@ -149,12 +150,19 @@ export default class KnowledgeGraph { Object.entries(kgEdge.attributes).forEach(([key, value]) => { if (key === 'edge-attributes') return; - let formatted_value: TrapiAttribute['value'] = NON_ARRAY_ATTRIBUTES.includes(key) - ? Array.from(value as Set).reduce((acc, val) => acc + val) - : Array.from(value as Set); + let formatted_value: TrapiAttribute['value']; + if (SUM_ATTRIBUTES.includes(key)) { + // for sums we don't want to remove duplicates + formatted_value = (value as string[]).reduce((acc, val) => acc + val); + } else if (NON_ARRAY_ATTRIBUTES.includes(key)) { + // for non array attributes we want to remove duplicates (ie. same string for knowledge_level multiple times) + formatted_value = Array.from(new Set(value as string[])).reduce((acc, val) => acc + val); + } else { + formatted_value = Array.from(new Set(value as string[])); + } if (key in SPECIAL_ATTRIBUTE_HANDLERS) { - formatted_value = SPECIAL_ATTRIBUTE_HANDLERS[key](value as Set, kgEdge); + formatted_value = SPECIAL_ATTRIBUTE_HANDLERS[key](new Set(value as string[]), kgEdge); } attributes.push({ diff --git a/src/index.ts b/src/index.ts index 9ce86640..e933d563 100644 --- a/src/index.ts +++ b/src/index.ts @@ -278,7 +278,7 @@ export default class TRAPIQueryHandler { ]); this.bteGraph.edges[boundEdgeID] = boundEdge; } else { - (this.bteGraph.edges[boundEdgeID].attributes['biolink:support_graphs'] as Set).add(supportGraphID); + this.bteGraph.edges[boundEdgeID].addAdditionalAttributes('biolink:support_graphs', supportGraphID); } if (!edgesToRebind[edgeID]) edgesToRebind[edgeID] = {}; if (!edgesToRebind[edgeID][subject]) edgesToRebind[edgeID][subject] = {}; From 38f52618ace4f16c1d3b5b68f7fb313c79a7d1a2 Mon Sep 17 00:00:00 2001 From: rjawesome Date: Fri, 18 Oct 2024 10:24:31 -0700 Subject: [PATCH 6/9] remove cap for evidence count --- src/graph/knowledge_graph.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/graph/knowledge_graph.ts b/src/graph/knowledge_graph.ts index 4f5f66b9..4897aeb6 100644 --- a/src/graph/knowledge_graph.ts +++ b/src/graph/knowledge_graph.ts @@ -172,7 +172,11 @@ export default class KnowledgeGraph { // Do not add multiple SemmedDB sentences/other "supporting study results" from the same publication if (attribute.attribute_type_id === "biolink:has_supporting_study_result" && attribute?.attributes?.find((attr) => attr.attribute_type_id === "biolink:publications")) { const publication = attribute.attributes.find((attr) => attr.attribute_type_id === "biolink:publications").value; - if (seenPmids.has(publication) || seenPmids.size >= 50) return; // publication has been seen or cap reached + // publication has been seen or cap reached + if (seenPmids.has(publication) || seenPmids.size >= 50) { + seenPmids.add(publication); + return; + } seenPmids.add(publication); } From 2ecb9ede6e52cc39ba9884eafe5a9b25df5ee26b Mon Sep 17 00:00:00 2001 From: tokebe <43009413+tokebe@users.noreply.github.com> Date: Fri, 18 Oct 2024 16:34:08 -0400 Subject: [PATCH 7/9] feat: support arbitrary node types/prefixes Also requires all matchable result nodes to match on *something* for figure to be deemed relevant to result --- src/results_assembly/pfocr.ts | 318 ++++++++++++++++++---------------- 1 file changed, 172 insertions(+), 146 deletions(-) diff --git a/src/results_assembly/pfocr.ts b/src/results_assembly/pfocr.ts index 21ec7893..9ff87361 100644 --- a/src/results_assembly/pfocr.ts +++ b/src/results_assembly/pfocr.ts @@ -2,51 +2,62 @@ import axios from 'axios'; import Debug from 'debug'; const debug = Debug('bte:biothings-explorer-trapi:pfocr'); import _ from 'lodash'; -import { LogEntry, StampedLog, intersection } from '@biothings-explorer/utils'; +import { LogEntry, StampedLog, intersection, biolink, toArray, removeBioLinkPrefix } from '@biothings-explorer/utils'; import { TrapiResult, TrapiKGNode, TrapiResponse, TrapiKGEdge } from '@biothings-explorer/types'; import Graph from '../graph/graph'; -// the minimum acceptable intersection size between the CURIEs +// The minimum acceptable number of nodes for a figure to match // in a TRAPI result and in a PFOCR figure. -const MATCH_COUNT_MIN = 2; +const MATCHABLE_NODE_MIN = 2; + +// Max number of figures per result const FIGURE_COUNT_MAX = 20; + +// Prefixes that can be searched against PFOCR +// Map supported prefixes as they appear in PFOCR to 'proper' form const SUPPORTED_PREFIXES = { - NCBIGene: 'associatedWith.mentions.genes.ncbigene', + ncbigene: 'NCBIGene', + doid: 'DOID', + mesh: 'MESH', + chebi: 'CHEBI', +}; + +// Supported top-level types and their mappings to PFOCR fields +// Must be top-level possible for biolink ancestry comparison +const SUPPORTED_TYPES = { + Gene: 'genes', + ChemicalEntity: 'chemicals', + DiseaseOrPhenotypicFeature: 'diseases', }; interface pfocrQueryBody { q: string[]; - scopes: string; + scopes: string[]; fields: string[]; - operator: string; - analyzer: string; - minimum_should_match: number; size: number; with_total: boolean; from?: number; } interface FigureResult { + query: string; _id: string; + _score: number; notfound?: boolean; associatedWith: { figureUrl: string; pfocrUrl: string; pmc: string; mentions: { - genes: { - ncbigene: string[]; + [type: string]: { + [prefix: string]: string[]; // list of IDs }; }; }; } -interface RawFigureResult extends FigureResult { - query: string; -} - -interface DeDupedFigureResult extends FigureResult { - query: Set; +interface FiguresByQuery { + [query: string]: FigureResult[]; } /* Get all results by using a scrolling query @@ -60,10 +71,10 @@ async function getAllByScrolling( baseUrl: string, queryBody: pfocrQueryBody, batchIndex: number, - hits: RawFigureResult[] = [], -): Promise { + hits: FigureResult[] = [], +): Promise { queryBody.from = batchIndex; - let data: { hits: RawFigureResult[]; max_total: number }; + let data: { hits: FigureResult[]; max_total: number }; try { data = (await axios.post(baseUrl, queryBody, { timeout: 15000 })).data; } catch (err) { @@ -72,7 +83,7 @@ async function getAllByScrolling( if (data) { hits.push(...data.hits); - debug(`Batch window ${batchIndex}-${batchIndex + 1000}: ${data.hits.length} hits retrieved for PFOCR figure data`); + debug(`Batch window ${batchIndex}-${batchIndex + 1000}: ${data.hits.filter(hit => !hit.notfound).length} hits retrieved for PFOCR figure data`); } if (data && batchIndex + 1000 < data.max_total) { @@ -82,9 +93,9 @@ async function getAllByScrolling( } } -/* qTerms are the CURIEs that go with the 'q' query parameter. - */ -async function getPfocrFigures(qTerms: Set): Promise { +// Combine query terms in acceptable batch sizes +// Then sort figure results by query term +async function getPfocrFigures(qTerms: string[]): Promise { debug(`Getting PFOCR figure data`); const url = { dev: 'https://biothings.ci.transltr.io/pfocr/query', @@ -92,27 +103,20 @@ async function getPfocrFigures(qTerms: Set): Promise { + _.chunk(qTerms, 100).map(async (qTermBatch) => { const queryBody = { - q: [...qTermBatch], - scopes: 'associatedWith.mentions.genes.ncbigene', // TODO better system when we use more than NCBIGene + q: qTermBatch, + scopes: [], fields: [ '_id', - 'associatedWith.mentions.genes.ncbigene', 'associatedWith.pmc', 'associatedWith.figureUrl', 'associatedWith.pfocrUrl', + 'associatedWith.mentions', ], - operator: 'OR', - analyzer: 'whitespace', - minimum_should_match: MATCH_COUNT_MIN, size: 1000, with_total: true, }; @@ -124,26 +128,15 @@ async function getPfocrFigures(qTerms: Set): Promise { - const figureId = figure._id; - if (!figure.notfound && !figuresAdded.has(figureId)) { - figuresAdded.add(figureId); - mergedFigureResults[figureId] = { ...figure, query: new Set([figure.query]) }; - } else if (!figure.notfound && figuresAdded.has(figureId)) { - mergedFigureResults[figureId].query.add(figure.query); - } - }); - - debug(`${Object.values(mergedFigureResults).length} total PFOCR figure hits retrieved`); - return Object.values(mergedFigureResults); + return figureResults.reduce((figuresByQuery: FiguresByQuery, figureResult) => { + if (!figuresByQuery[figureResult.query]) figuresByQuery[figureResult.query] = []; + if (!figureResult.notfound) figuresByQuery[figureResult.query].push(figureResult); + return figuresByQuery; + }, {}); } +// Results bind nodes, but bound edges may recursively use support graphs which reference other nodes +// Traverse result recursively (using stack) and return all related nodes function traverseResultForNodes(result: TrapiResult, response: TrapiResponse): Set { const kg = response.message.knowledge_graph; const nodes: Set = new Set(); @@ -170,62 +163,92 @@ function traverseResultForNodes(result: TrapiResult, response: TrapiResponse): S return nodes; } -/* time complexity: O(t*f) - * where - * t: trapiResults.length - * f: figures.length - */ -export async function enrichTrapiResultsWithPfocrFigures(response: TrapiResponse): Promise { - // NOTE: This function operates on the actual TRAPI information that will be returned - // to the client. Don't mutate what shouldn't be mutated! +function generateQterms(response: TrapiResponse): { qTerms: string[]; qTermByResults: Map } { const results = response.message.results; - const logs: StampedLog[] = []; - let resultsWithTruncatedFigures = 0; - const truncatedFigures: Set = new Set(); - const curieCombosByResult: Map = new Map(); - const curiesByResult: Map> = new Map(); - - const curieCombos: Set = results.reduce((combos: Set, result: TrapiResult) => { + const qTermByResults: Map = new Map(); + const qTerms = results.reduce((qTerms: string[], result: TrapiResult) => { const nodes: Set = new Set(); Object.values(result.node_bindings).forEach((bindings) => bindings.forEach((binding) => nodes.add(response.message.knowledge_graph.nodes[binding.id])), ); - const combo: Set = new Set(); - let matchedNodes = 0; + // Generate sets per supported node of supported curies + const nodeSets: Set[] = []; + const nodeTypes: string[] = []; [...nodes].forEach((node) => { - let nodeMatched = false; + let supportedCategory = toArray(biolink.getAncestorClasses(removeBioLinkPrefix(node.categories[0]))).find( + (category) => typeof SUPPORTED_TYPES[category] !== 'undefined', + ); + if (!supportedCategory) return; + + const supportedEquivalents: Set = new Set(); const equivalentCuries = (node.attributes?.find((attribute) => attribute.attribute_type_id === 'biolink:xref')?.value as string[]) ?? []; equivalentCuries.forEach((curie) => { const prefix = curie.split(':')[0]; - const suffix = curie.replace(`${prefix}:`, ''); - if (Object.keys(SUPPORTED_PREFIXES).includes(prefix)) { - combo.add(suffix); - nodeMatched = true; + if (supportedCategory && Object.keys(SUPPORTED_PREFIXES).includes(prefix.toLowerCase())) { + supportedEquivalents.add(curie); } }); - if (nodeMatched) matchedNodes += 1; + if (supportedEquivalents.size === 0) return; // Node has no supported curies + + nodeSets.push(supportedEquivalents); + nodeTypes.push(SUPPORTED_TYPES[supportedCategory]); }); - if (matchedNodes >= MATCH_COUNT_MIN) { - const comboString = [...combo].join(' '); - curieCombosByResult.set(result, comboString); - combos.add(comboString); - curiesByResult.set(result, combo); - } - return combos; - }, new Set()); - if (curieCombos.size < 1) { + if (nodeSets.length < MATCHABLE_NODE_MIN) return qTerms; // Result doesn't have enough matchable nodes + + // Generate qTerm for result + const qTermParts: string[] = []; + nodeSets.forEach((nodeSet, i) => { + // Separate by prefix for minimal formatting + const idsByPrefix: { [prefix: string]: string[] } = {}; + const nodeType = nodeTypes[i]; + nodeSet.forEach((curie) => { + const prefix = curie.split(':')[0]; + const suffix = curie.replace(`${prefix}:`, ''); + if (!idsByPrefix[prefix]) idsByPrefix[prefix] = []; + idsByPrefix[prefix].push(suffix); + }); + + const orClause: string[] = []; + + Object.entries(idsByPrefix).forEach(([prefix, ids]) => { + orClause.push(`associatedWith.mentions.${nodeType}.${[prefix.toLowerCase()]}:(${ids.join(' OR ')})`); + }); + + qTermParts.push(`(${orClause.join(' OR ')})`); + }); + + const qTerm = qTermParts.join(' AND '); + qTerms.push(qTerm); + qTermByResults.set(result, qTerm); + + return qTerms; + }, []); + + return { qTerms, qTermByResults }; +} + +export async function enrichTrapiResultsWithPfocrFigures(response: TrapiResponse): Promise { + // NOTE: This function operates on the actual TRAPI information that will be returned + // to the client. Don't mutate what shouldn't be mutated! + + const results = response.message.results; + const logs: StampedLog[] = []; + + const { qTerms, qTermByResults } = generateQterms(response); + + if (qTerms.length < 1) { // No TRAPI result can satisfy MATCH_COUNT_MIN - logs.push(new LogEntry('DEBUG', null, 'Query does not match criteria, skipping PFOCR figure enrichment.').getLog()); + logs.push(new LogEntry('DEBUG', null, 'No result matches criteria, skipping PFOCR figure enrichment.').getLog()); return logs; } - let figures: DeDupedFigureResult[]; + let figures: FiguresByQuery; try { - figures = await getPfocrFigures(curieCombos); + figures = await getPfocrFigures(qTerms); } catch (err) { debug('Error getting PFOCR figures (enrichTrapiResultsWithPfocrFigures)', (err as Error).message); logs.push( @@ -238,102 +261,105 @@ export async function enrichTrapiResultsWithPfocrFigures(response: TrapiResponse } if (!figures) return logs; - debug(`${figures.length} PFOCR figures match at least ${MATCH_COUNT_MIN} nodes from any TRAPI result`); - - const figuresByCuries: { [queryCuries: string]: DeDupedFigureResult[] } = {}; - figures.forEach((figure) => { - [...figure.query].forEach((queryCuries) => { - figuresByCuries[queryCuries] = - queryCuries in figuresByCuries ? [...figuresByCuries[queryCuries], figure] : [figure]; - }); - }); - + // Metrics + const dedupedFigures: { [figureUrl: string]: FigureResult } = {}; const matchedFigures: Set = new Set(); - const matchedTrapiResults: Set = new Set(); + const matchedResults: Set = new Set(); + let resultsWithTruncatedFigures = 0; + const truncatedFigures: Set = new Set(); - const allGenesInAllFigures = figures.reduce((set, fig) => { - fig.associatedWith.mentions.genes.ncbigene.forEach((gene) => set.add(gene)); + // Get all supported curies from every figure. Store as curies to avoid collisions/other issues + const curiesByFigure: { [figureUrl: string]: Set } = {}; + const allCuriesInAllFigures = Object.values(figures).reduce((set, figureSet) => { + figureSet.forEach((figure) => { + if (dedupedFigures[figure.associatedWith.figureUrl]) return; // Already handled + + dedupedFigures[figure.associatedWith.figureUrl] = figure; + const figureCuries = new Set(); + Object.entries(figure.associatedWith.mentions).forEach(([type, prefixes]) => { + if (!Object.values(SUPPORTED_TYPES).includes(type)) return; + + Object.entries(prefixes).forEach(([prefix, ids]) => { + prefix = SUPPORTED_PREFIXES[prefix]; + if (!prefix) return; + ids.forEach((id) => figureCuries.add(`${prefix}:${id}`)); + }); + }); + figureCuries.forEach((curie) => set.add(curie)); + curiesByFigure[figure.associatedWith.figureUrl] = figureCuries; + }); return set; - }, new Set() as Set); + }, new Set()); - for (const trapiResult of results) { - // No figures match this result - if (!figuresByCuries[curieCombosByResult.get(trapiResult)]) continue; + debug( + `${Object.keys(dedupedFigures).length} PFOCR figures match at least ${MATCHABLE_NODE_MIN} nodes from any TRAPI result`, + ); + + // Iterate over results and grab figures, scoring figures and then truncating + results.forEach((result) => { + const resultFigures = figures[qTermByResults.get(result)]; + if (!resultFigures) return; - const resultNodes = traverseResultForNodes(trapiResult, response); + const resultNodes = traverseResultForNodes(result, response); const resultCuries: Set = [...resultNodes].reduce((curies, node) => { const equivalentCuries = (node.attributes?.find((attribute) => attribute.attribute_type_id === 'biolink:xref')?.value as string[]) ?? []; equivalentCuries.forEach((curie) => { const prefix = curie.split(':')[0]; const suffix = curie.replace(`${prefix}:`, ''); - if (Object.keys(SUPPORTED_PREFIXES).includes(prefix)) curies.add(suffix); + if (Object.keys(SUPPORTED_PREFIXES).includes(prefix.toLowerCase())) curies.add(curie); }); return curies; }, new Set()); + const resultCuriesInAllFigures = intersection(allCuriesInAllFigures, resultCuries); - const resultGenesInAllFigures = intersection(allGenesInAllFigures, resultCuries); - - (figuresByCuries[curieCombosByResult.get(trapiResult)] ?? []).forEach((figure) => { - if (!('pfocr' in trapiResult)) { - trapiResult.pfocr = []; - } - - const figureCurieSet = new Set(figure.associatedWith.mentions.genes.ncbigene); - const resultGenesInFigure = intersection(resultCuries, figureCurieSet); + resultFigures.forEach((figure) => { + const figureCuries = curiesByFigure[figure.associatedWith.figureUrl]; + const resultCuriesInFigure = intersection(resultCuries, figureCuries); - // let otherGenesInOtherFigures = [...allGenesInAllFigures].filter((gene) => { - // return !resultCuries.has(gene) && !figureCurieSet.has(gene); - // }).length; + const precision = resultCuriesInFigure.size / figureCuries.size; + const recall = resultCuriesInFigure.size / resultCuriesInAllFigures.size; - const precision = resultGenesInFigure.size / figureCurieSet.size; - const recall = resultGenesInFigure.size / resultGenesInAllFigures.size; + if (!('pfocr' in result)) result.pfocr = []; - trapiResult.pfocr.push({ + result.pfocr.push({ + // TODO: do we want to include figure title? Note: this would need to be added to queryBody. + //title: figure.associatedWith.title, figureUrl: figure.associatedWith.figureUrl, pfocrUrl: figure.associatedWith.pfocrUrl, pmc: figure.associatedWith.pmc, - // TODO: do we want to include figure title? Note: this would need to be added to queryBody. - //title: figure.associatedWith.title, - matchedCuries: [...resultGenesInFigure].map((geneID) => `NCBIGene:${geneID}`), + matchedCuries: [...resultCuriesInFigure], score: 2 * ((precision * recall) / (precision + recall)), }); - matchedTrapiResults.add(trapiResult); + matchedResults.add(result); }); + if (!result.pfocr) return logs; // Result had no figures + // Sort by score and cut down to top 20 - const sortedFigures = trapiResult.pfocr.sort((figA, figB) => { + const sortedFigures = result.pfocr.sort((figA, figB) => { return figB.score - figA.score; }); if (sortedFigures.length > FIGURE_COUNT_MAX) { resultsWithTruncatedFigures += 1; sortedFigures.slice(0, 20).forEach((figure) => truncatedFigures.add(figure.figureUrl)); - // debug(`Truncating ${sortedFigures.length} PFOCR figures to ${FIGURE_COUNT_MAX} for TRAPI result w/ curies ${trapiResultToCurieSet.get(trapiResult).split(' ').map((ID) => `NCBIGene:${ID}`).join(', ')}`) + result.pfocr = sortedFigures.slice(0, 20); } - trapiResult.pfocr = sortedFigures.slice(0, 20); - trapiResult.pfocr.map((figure) => matchedFigures.add(figure.figureUrl)); - } + result.pfocr.map((figure) => matchedFigures.add(figure.figureUrl)); + }); - // Each of the matched figures has at least one TRAPI result with an overlap of 2+ genes. - // Each of the matched TRAPI results has at least one figure with an overlap of 2+ genes. + // Each of the matched figures has at least one TRAPI result with a 2+ node overlap of curies with the figure. + // Each of the matched TRAPI results has at least one figure with curies from 2+ of its bound nodes. const unusedFigures = [...truncatedFigures].filter((figureUrl) => !matchedFigures.has(figureUrl)).length; - const message = `${resultsWithTruncatedFigures} results had pfocr figures truncated to max of 20 (${truncatedFigures.size} unique figures removed, ${unusedFigures} not appearing elsewhere in results).`; + let message = `${resultsWithTruncatedFigures} results had pfocr figures truncated to max of 20 (${truncatedFigures.size} unique figures removed, ${unusedFigures} not appearing elsewhere in results).`; debug(message); logs.push(new LogEntry('DEBUG', null, message).getLog()); - debug( - `${MATCH_COUNT_MIN}+ node matches: ${matchedFigures.size} PFOCR figures across ${matchedTrapiResults.size} TRAPI results`, - ); - logs.push( - new LogEntry( - 'INFO', - null, - `${matchedTrapiResults.size} results successfully enriched with ${matchedFigures.size} unique PFOCR figures.`, - ).getLog(), - ); + + message = `${matchedResults.size} results successfully enriched with ${matchedFigures.size} unique PFOCR figures.`; + debug(message); + logs.push(new LogEntry('INFO', null, message).getLog()); return logs; } - -module.exports.enrichTrapiResultsWithPfocrFigures = enrichTrapiResultsWithPfocrFigures; From a6a99c9424e219a414a289965f27d62df756e2a4 Mon Sep 17 00:00:00 2001 From: tokebe <43009413+tokebe@users.noreply.github.com> Date: Fri, 18 Oct 2024 19:05:11 -0400 Subject: [PATCH 8/9] feat: add matchedKGNodes for clarity --- src/results_assembly/pfocr.ts | 48 ++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/src/results_assembly/pfocr.ts b/src/results_assembly/pfocr.ts index 9ff87361..a17c84a5 100644 --- a/src/results_assembly/pfocr.ts +++ b/src/results_assembly/pfocr.ts @@ -83,7 +83,9 @@ async function getAllByScrolling( if (data) { hits.push(...data.hits); - debug(`Batch window ${batchIndex}-${batchIndex + 1000}: ${data.hits.filter(hit => !hit.notfound).length} hits retrieved for PFOCR figure data`); + debug( + `Batch window ${batchIndex}-${batchIndex + 1000}: ${data.hits.filter((hit) => !hit.notfound).length} hits retrieved for PFOCR figure data`, + ); } if (data && batchIndex + 1000 < data.max_total) { @@ -163,14 +165,25 @@ function traverseResultForNodes(result: TrapiResult, response: TrapiResponse): S return nodes; } -function generateQterms(response: TrapiResponse): { qTerms: string[]; qTermByResults: Map } { +function generateQterms(response: TrapiResponse): { + qTerms: string[]; + qTermByResults: Map; + primaryCuriebyCurie: Map; +} { const results = response.message.results; const qTermByResults: Map = new Map(); + const primaryCuriebyCurie: Map = new Map(); const qTerms = results.reduce((qTerms: string[], result: TrapiResult) => { const nodes: Set = new Set(); + const primaryCurieByNode: Map = new Map(); Object.values(result.node_bindings).forEach((bindings) => - bindings.forEach((binding) => nodes.add(response.message.knowledge_graph.nodes[binding.id])), + bindings.forEach((binding) => { + const node = response.message.knowledge_graph.nodes[binding.id]; + nodes.add(node); + primaryCurieByNode.set(node, binding.id); + primaryCuriebyCurie.set(binding.id, binding.id); // Ensure self-primary relationship + }), ); // Generate sets per supported node of supported curies @@ -186,6 +199,8 @@ function generateQterms(response: TrapiResponse): { qTerms: string[]; qTermByRes const equivalentCuries = (node.attributes?.find((attribute) => attribute.attribute_type_id === 'biolink:xref')?.value as string[]) ?? []; equivalentCuries.forEach((curie) => { + primaryCuriebyCurie.set(curie, primaryCurieByNode.get(node)); // Keep track of primary for later use + const prefix = curie.split(':')[0]; if (supportedCategory && Object.keys(SUPPORTED_PREFIXES).includes(prefix.toLowerCase())) { supportedEquivalents.add(curie); @@ -228,7 +243,7 @@ function generateQterms(response: TrapiResponse): { qTerms: string[]; qTermByRes return qTerms; }, []); - return { qTerms, qTermByResults }; + return { qTerms, qTermByResults, primaryCuriebyCurie }; } export async function enrichTrapiResultsWithPfocrFigures(response: TrapiResponse): Promise { @@ -238,7 +253,7 @@ export async function enrichTrapiResultsWithPfocrFigures(response: TrapiResponse const results = response.message.results; const logs: StampedLog[] = []; - const { qTerms, qTermByResults } = generateQterms(response); + const { qTerms, qTermByResults, primaryCuriebyCurie } = generateQterms(response); if (qTerms.length < 1) { // No TRAPI result can satisfy MATCH_COUNT_MIN @@ -320,8 +335,28 @@ export async function enrichTrapiResultsWithPfocrFigures(response: TrapiResponse const precision = resultCuriesInFigure.size / figureCuries.size; const recall = resultCuriesInFigure.size / resultCuriesInAllFigures.size; - if (!('pfocr' in result)) result.pfocr = []; + const matchedCuries = new Set(); + resultCuriesInFigure.forEach((curie) => { + let primary = primaryCuriebyCurie.get(curie); + if (primary) { + matchedCuries.add(primary); + return; + } + // Didn't match, so it's from a node used in an aux graph somewhere + // Thankfully, this is an edge case, and the search space is already pretty small + // So performance hit should be minimal in the vast majority of cases + [...resultNodes].find((node) => { + const equivalentCuries = + (node.attributes?.find((attribute) => attribute.attribute_type_id === 'biolink:xref')?.value as string[]) ?? + []; + if (equivalentCuries.includes(curie)) { + matchedCuries.add(equivalentCuries[0]); // First equivalent is always the primary + return true; + } + }); + }); + if (!('pfocr' in result)) result.pfocr = []; result.pfocr.push({ // TODO: do we want to include figure title? Note: this would need to be added to queryBody. //title: figure.associatedWith.title, @@ -329,6 +364,7 @@ export async function enrichTrapiResultsWithPfocrFigures(response: TrapiResponse pfocrUrl: figure.associatedWith.pfocrUrl, pmc: figure.associatedWith.pmc, matchedCuries: [...resultCuriesInFigure], + matchedKGNodes: [...matchedCuries], score: 2 * ((precision * recall) / (precision + recall)), }); matchedResults.add(result); From eb63a41ed654dfeeae8065286246f6d7e3bab49d Mon Sep 17 00:00:00 2001 From: tokebe <43009413+tokebe@users.noreply.github.com> Date: Mon, 21 Oct 2024 12:46:19 -0400 Subject: [PATCH 9/9] test: update attribute type expectations --- __test__/integration/graph/graph.test.ts | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/__test__/integration/graph/graph.test.ts b/__test__/integration/graph/graph.test.ts index af88dfc9..65d1ca72 100644 --- a/__test__/integration/graph/graph.test.ts +++ b/__test__/integration/graph/graph.test.ts @@ -113,7 +113,7 @@ describe('Test graph class', () => { expect(Array.from(g.edges['3eb29a4cead0e5f3c3bdca4997bf215b'].apis)).toEqual(['API1']); expect(g.edges['3eb29a4cead0e5f3c3bdca4997bf215b'].sources).toHaveProperty('source1'); expect(Array.from(g.edges['3eb29a4cead0e5f3c3bdca4997bf215b'].publications)).toEqual(['PMID:1', 'PMID:2']); - expect(g.edges['3eb29a4cead0e5f3c3bdca4997bf215b'].attributes).toHaveProperty('relation', new Set(['relation1'])); + expect(g.edges['3eb29a4cead0e5f3c3bdca4997bf215b'].attributes).toHaveProperty('relation', ['relation1']); }); test('Multiple query results are correctly updated for two edges having same input, predicate and output', () => { @@ -134,13 +134,13 @@ describe('Test graph class', () => { expect(Array.from(g.edges['3eb29a4cead0e5f3c3bdca4997bf215b'].apis)).toEqual(['API1']); expect(g.edges['3eb29a4cead0e5f3c3bdca4997bf215b'].sources).toHaveProperty('source1'); expect(Array.from(g.edges['3eb29a4cead0e5f3c3bdca4997bf215b'].publications)).toEqual(['PMID:1', 'PMID:2']); - expect(g.edges['3eb29a4cead0e5f3c3bdca4997bf215b'].attributes).toHaveProperty('relation', new Set(['relation1'])); + expect(g.edges['3eb29a4cead0e5f3c3bdca4997bf215b'].attributes).toHaveProperty('relation', ['relation1']); expect(g.edges).toHaveProperty('6930dcb2e9363817e9f6e736829ce278'); expect(Array.from(g.edges['6930dcb2e9363817e9f6e736829ce278'].apis)).toEqual(['API2']); expect(g.edges['6930dcb2e9363817e9f6e736829ce278'].sources).toHaveProperty('source2'); expect(Array.from(g.edges['6930dcb2e9363817e9f6e736829ce278'].publications)).toEqual(['PMC:1', 'PMC:2']); - expect(g.edges['6930dcb2e9363817e9f6e736829ce278'].attributes).toHaveProperty('relation', new Set(['relation2'])); + expect(g.edges['6930dcb2e9363817e9f6e736829ce278'].attributes).toHaveProperty('relation', ['relation2']); }); test('Multiple query results for different edges are correctly updated', () => { @@ -161,19 +161,19 @@ describe('Test graph class', () => { expect(Array.from(g.edges['3eb29a4cead0e5f3c3bdca4997bf215b'].apis)).toEqual(['API1']); expect(g.edges['3eb29a4cead0e5f3c3bdca4997bf215b'].sources).toHaveProperty('source1'); expect(Array.from(g.edges['3eb29a4cead0e5f3c3bdca4997bf215b'].publications)).toEqual(['PMID:1', 'PMID:2']); - expect(g.edges['3eb29a4cead0e5f3c3bdca4997bf215b'].attributes).toHaveProperty('relation', new Set(['relation1'])); + expect(g.edges['3eb29a4cead0e5f3c3bdca4997bf215b'].attributes).toHaveProperty('relation', ['relation1']); expect(g.edges).toHaveProperty('6930dcb2e9363817e9f6e736829ce278'); expect(Array.from(g.edges['6930dcb2e9363817e9f6e736829ce278'].apis)).toEqual(['API2']); expect(g.edges['6930dcb2e9363817e9f6e736829ce278'].sources).toHaveProperty('source2'); expect(Array.from(g.edges['6930dcb2e9363817e9f6e736829ce278'].publications)).toEqual(['PMC:1', 'PMC:2']); - expect(g.edges['6930dcb2e9363817e9f6e736829ce278'].attributes).toHaveProperty('relation', new Set(['relation2'])); + expect(g.edges['6930dcb2e9363817e9f6e736829ce278'].attributes).toHaveProperty('relation', ['relation2']); expect(g.edges).toHaveProperty('38e8cf1917452c83bb878c5a916ef86a'); expect(Array.from(g.edges['38e8cf1917452c83bb878c5a916ef86a'].apis)).toEqual(['API3']); expect(g.edges['38e8cf1917452c83bb878c5a916ef86a'].sources).toHaveProperty('source3'); expect(Array.from(g.edges['38e8cf1917452c83bb878c5a916ef86a'].publications)).toEqual(['PMC:3', 'PMC:4']); - expect(g.edges['38e8cf1917452c83bb878c5a916ef86a'].attributes).toHaveProperty('relation', new Set(['relation3'])); + expect(g.edges['38e8cf1917452c83bb878c5a916ef86a'].attributes).toHaveProperty('relation', ['relation3']); }); test('Multiple attributes with the same name are merged', () => { @@ -187,9 +187,10 @@ describe('Test graph class', () => { 'PMC:6', 'PMC:7', ]); - expect(g.edges['38e8cf1917452c83bb878c5a916ef86a'].attributes).toHaveProperty( - 'relation', - new Set(['relation3', 'relation3a', 'relation3b']), - ); + expect(g.edges['38e8cf1917452c83bb878c5a916ef86a'].attributes).toHaveProperty('relation', [ + 'relation3', + 'relation3a', + 'relation3b', + ]); }); });