Skip to content

Commit

Permalink
Merge pull request #201 from biothings/fix-837
Browse files Browse the repository at this point in the history
PFOCR Result enrichment support for nested support graphs
  • Loading branch information
tokebe authored Aug 2, 2024
2 parents 243998e + fedd526 commit 1da1bde
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 104 deletions.
4 changes: 4 additions & 0 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import { QueryHandlerOptions } from '@biothings-explorer/types';
import BTEGraph from './graph/graph';
import QEdge from './query_edge';
import { Telemetry } from '@biothings-explorer/utils';
import { enrichTrapiResultsWithPfocrFigures } from './results_assembly/pfocr';

// Exports for external availability
export * from './types';
Expand Down Expand Up @@ -703,6 +704,9 @@ export default class TRAPIQueryHandler {
this.bteGraph.prune(this.finalizedResults, this.auxGraphs);
this.bteGraph.notify();

// Attempt to enrich results with PFOCR figures
this.logs = [...this.logs, ...(await enrichTrapiResultsWithPfocrFigures(this.getResponse()))];

span3?.finish();

// check primary knowledge sources
Expand Down
172 changes: 77 additions & 95 deletions src/results_assembly/pfocr.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,16 @@ const debug = Debug('bte:biothings-explorer-trapi:pfocr');
import { intersection } from '../utils';
import _ from 'lodash';
import { LogEntry, StampedLog } from '@biothings-explorer/utils';
import { TrapiResult } from '@biothings-explorer/types';
import { TrapiResult, TrapiKGNode, TrapiResponse, TrapiKGEdge } from '@biothings-explorer/types';
import Graph from '../graph/graph';

// the minimum acceptable intersection size between the CURIEs
// in a TRAPI result and in a PFOCR figure.
const MATCH_COUNT_MIN = 2;
const FIGURE_COUNT_MAX = 20;
const SUPPORTED_PREFIXES = {
NCBIGene: 'associatedWith.mentions.genes.ncbigene',
};

interface pfocrQueryBody {
q: string[];
Expand All @@ -28,6 +32,7 @@ interface FigureResult {
notfound?: boolean;
associatedWith: {
figureUrl: string;
pfocrUrl: string;
pmc: string;
mentions: {
genes: {
Expand Down Expand Up @@ -89,7 +94,13 @@ async function getPfocrFigures(qTerms: Set<string>): Promise<DeDupedFigureResult
const queryBody = {
q: [...qTermBatch],
scopes: 'associatedWith.mentions.genes.ncbigene', // TODO better system when we use more than NCBIGene
fields: ['_id', 'associatedWith.mentions.genes.ncbigene', 'associatedWith.pmc', 'associatedWith.figureUrl'],
fields: [
'_id',
'associatedWith.mentions.genes.ncbigene',
'associatedWith.pmc',
'associatedWith.figureUrl',
'associatedWith.pfocrUrl',
],
operator: 'OR',
analyzer: 'whitespace',
minimum_should_match: MATCH_COUNT_MIN,
Expand Down Expand Up @@ -124,85 +135,87 @@ async function getPfocrFigures(qTerms: Set<string>): Promise<DeDupedFigureResult
return Object.values(mergedFigureResults);
}

function getMatchableQNodeIDs(allTrapiResults: TrapiResult[]): Set<string> {
const matchableQNodeIDs: Set<string> = new Set();

if (allTrapiResults.length === 0) {
return matchableQNodeIDs;
}
function traverseResultForNodes(result: TrapiResult, response: TrapiResponse): Set<TrapiKGNode> {
const kg = response.message.knowledge_graph;
const nodes: Set<TrapiKGNode> = new Set();
const edgeStack: TrapiKGEdge[] = [];
Object.values(result.node_bindings).forEach((bindings) =>
bindings.forEach((binding) => nodes.add(kg.nodes[binding.id])),
);
Object.values(result.analyses[0].edge_bindings).forEach((bindings) =>
bindings.forEach((binding) => edgeStack.push(kg.edges[binding.id])),
);

// TODO: this will need to be updated to handle non-NCBIGene CURIEs as well
// as non-gene CURIEs once we support querying for chemicals and diseases.

const supportedPrefixes = new Set(['NCBIGene']);
for (const trapiResult of allTrapiResults) {
for (const [qNodeID, nodeBindingValues] of Object.entries(trapiResult.node_bindings)) {
for (const nodeBindingValue of nodeBindingValues) {
const prefix = nodeBindingValue.id.split(':')[0];
if (supportedPrefixes.has(prefix)) {
matchableQNodeIDs.add(qNodeID);
break;
}
}
while (edgeStack.length > 0) {
const edge = edgeStack.pop();
nodes.add(kg.nodes[edge.object]);
nodes.add(kg.nodes[edge.subject]);
const supportGraphs = edge.attributes.find((attribute) => attribute.attribute_type_id == 'biolink:support_graphs');
if (supportGraphs) {
(supportGraphs.value as string[]).forEach((auxGraphID) =>
response.message.auxiliary_graphs[auxGraphID].edges.forEach((edgeID) => edgeStack.push(kg.edges[edgeID])),
);
}
}

debug(`QNode(s) having CURIEs that PFOCR could potentially match: ${[...matchableQNodeIDs]}`);
return matchableQNodeIDs;
return nodes;
}

/* time complexity: O(t*f)
* where
* t: trapiResults.length
* f: figures.length
*/
export async function enrichTrapiResultsWithPfocrFigures(allTrapiResults: TrapiResult[]): Promise<StampedLog[]> {
const matchableQNodeIDs = getMatchableQNodeIDs(allTrapiResults);
export async function enrichTrapiResultsWithPfocrFigures(response: TrapiResponse): Promise<StampedLog[]> {
// NOTE: This function operates on the actual TRAPI information that will be returned
// to the client. Don't mutate what shouldn't be mutated!
const results = response.message.results;
const logs: StampedLog[] = [];
let resultsWithTruncatedFigures = 0;
const truncatedFigures: Set<string> = new Set();

if (matchableQNodeIDs.size < MATCH_COUNT_MIN) {
const curieCombosByResult: Map<TrapiResult, string> = new Map();
const curiesByResult: Map<TrapiResult, Set<string>> = new Map();

const curieCombos: Set<string> = results.reduce((combos: Set<string>, result: TrapiResult) => {
const nodes: Set<TrapiKGNode> = traverseResultForNodes(result, response);
const combo: Set<string> = new Set();
let matchedNodes = 0;
[...nodes].forEach((node) => {
let nodeMatched = false;
const equivalentCuries = node.attributes?.find((attribute) => attribute.attribute_type_id === 'biolink:xref')
.value as string[];
equivalentCuries.forEach((curie) => {
const prefix = curie.split(':')[0];
const suffix = curie.replace(`${prefix}:`, '');
if (Object.keys(SUPPORTED_PREFIXES).includes(prefix)) {
combo.add(suffix);
nodeMatched = true;
}
});
if (nodeMatched) matchedNodes += 1;
});
if (matchedNodes >= MATCH_COUNT_MIN) {
const comboString = [...combo].join(' ');
curieCombosByResult.set(result, comboString);
combos.add(comboString);
curiesByResult.set(result, combo);
}
return combos;
}, new Set<string>());

if (curieCombos.size < 1) {
// No TRAPI result can satisfy MATCH_COUNT_MIN
logs.push(new LogEntry('DEBUG', null, 'Query does not match criteria, skipping PFOCR figure enrichment.').getLog());
return logs;
}

// TODO: currently just NCBIGene CURIEs. Expand to handle any CURIE in PFOCR.

const trapiResultToCurieSet: Map<TrapiResult, string> = new Map();

const curieCombinations: Set<string> = new Set(
allTrapiResults.reduce((arr: string[], res) => {
const resultCuries: Set<string> = new Set();
const matchedQNodes: Set<string> = new Set();
[...matchableQNodeIDs].forEach((qNodeID) => {
res.node_bindings[qNodeID]
.map((node_binding) => node_binding.id)
.filter((curie) => curie.startsWith('NCBIGene:'))
.forEach((curie) => {
resultCuries.add(curie);
matchedQNodes.add(qNodeID);
});
});

const resultCuriesString = [...resultCuries].map((curie) => curie.replace('NCBIGene:', '')).join(' ');

if (resultCuries.size >= MATCH_COUNT_MIN && matchedQNodes.size >= MATCH_COUNT_MIN) {
trapiResultToCurieSet.set(res, resultCuriesString);
arr.push(resultCuriesString);
}

return arr;
}, []),
);

const figures = await getPfocrFigures(curieCombinations).catch((err) => {
const figures = await getPfocrFigures(curieCombos).catch((err) => {
debug('Error getting PFOCR figures (enrichTrapiResultsWithPfocrFigures)', err);
throw err;
});

debug(`${figures.length} PFOCR figures match at least ${MATCH_COUNT_MIN} genes from any TRAPI result`);
debug(`${figures.length} PFOCR figures match at least ${MATCH_COUNT_MIN} nodes from any TRAPI result`);

const figuresByCuries: { [queryCuries: string]: DeDupedFigureResult[] } = {};
figures.forEach((figure) => {
Expand All @@ -220,45 +233,20 @@ export async function enrichTrapiResultsWithPfocrFigures(allTrapiResults: TrapiR
// return set;
// }, new Set() as Set<string>);

for (const trapiResult of allTrapiResults) {
for (const trapiResult of results) {
// No figures match this result
if (!figuresByCuries[trapiResultToCurieSet.get(trapiResult)]) continue;

const resultCuries: Set<string> = new Set();
const resultMatchableQNodeIDs: Set<string> = new Set();
[...matchableQNodeIDs].forEach((qNodeID) => {
trapiResult.node_bindings[qNodeID]
.map((node_binding) => node_binding.id)
.filter((curie) => curie.startsWith('NCBIGene:'))
.forEach((curie) => {
resultCuries.add(curie.replace('NCBIGene:', ''));
resultMatchableQNodeIDs.add(qNodeID);
});
});
if (resultMatchableQNodeIDs.size < 2) continue;
if (!figuresByCuries[curieCombosByResult.get(trapiResult)]) continue;

const resultCuries = curiesByResult.get(trapiResult);

(figuresByCuries[trapiResultToCurieSet.get(trapiResult)] ?? []).forEach((figure) => {
(figuresByCuries[curieCombosByResult.get(trapiResult)] ?? []).forEach((figure) => {
if (!('pfocr' in trapiResult)) {
trapiResult.pfocr = [];
}

const figureCurieSet = new Set(figure.associatedWith.mentions.genes.ncbigene);
const resultGenesInFigure = intersection(resultCuries, figureCurieSet);

const matchedQNodes = [...matchableQNodeIDs].filter((matchableQNodeID) => {
const currentQNodeCurieSet = new Set(
trapiResult.node_bindings[matchableQNodeID].map((node_binding) => node_binding.id),
);

return (
intersection(currentQNodeCurieSet, new Set([...resultGenesInFigure].map((geneID) => `NCBIGene:${geneID}`)))
.size > 0
);
});

// If we've matched on 2 curies, but we haven't actually matched on multiple nodes
if (matchedQNodes.length < 2) return;

const otherGenesInFigure = figureCurieSet.size - resultGenesInFigure.size;

const resultGenesInOtherFigures = [...resultCuries].filter((gene) => {
Expand All @@ -273,18 +261,12 @@ export async function enrichTrapiResultsWithPfocrFigures(allTrapiResults: TrapiR

trapiResult.pfocr.push({
figureUrl: figure.associatedWith.figureUrl,
pfocrUrl: figure.associatedWith.pfocrUrl,
pmc: figure.associatedWith.pmc,
// TODO: do we want to include figure title? Note: this would need to be added to queryBody.
//title: figure.associatedWith.title,
matchedCuries: [...resultGenesInFigure].map((geneID) => `NCBIGene:${geneID}`),
score: 2 * ((precision * recall) / (precision + recall)),
// 1 -
// parseFloat(
// Analyze([
// [resultGenesInFigure.size, resultGenesInOtherFigures],
// [otherGenesInFigure, otherGenesInOtherFigures],
// ]).pValue,
// ),
});
matchedTrapiResults.add(trapiResult);
});
Expand All @@ -311,7 +293,7 @@ export async function enrichTrapiResultsWithPfocrFigures(allTrapiResults: TrapiR
debug(message);
logs.push(new LogEntry('DEBUG', null, message).getLog());
debug(
`${MATCH_COUNT_MIN}+ CURIE matches: ${matchedFigures.size} PFOCR figures and ${matchedTrapiResults.size} TRAPI results`,
`${MATCH_COUNT_MIN}+ node matches: ${matchedFigures.size} PFOCR figures across ${matchedTrapiResults.size} TRAPI results`,
);
logs.push(
new LogEntry(
Expand Down
17 changes: 8 additions & 9 deletions src/results_assembly/query_results.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import { zip } from 'lodash';
const debug = Debug('bte:biothings-explorer-trapi:QueryResult');
import { getScores, calculateScore, ScoreCombos } from './score';
import { Record } from '@biothings-explorer/api-response-transform';
import { enrichTrapiResultsWithPfocrFigures } from './pfocr';
import * as config from '../config';

export interface RecordsByQEdgeID {
Expand Down Expand Up @@ -477,13 +476,13 @@ export default class TrapiResultsAssembler {
.sort((result1, result2) => (result2.analyses[0].score ?? 0) - (result1.analyses[0].score ?? 0)); //sort by decreasing score

if (shouldScore) {
try {
const pfocrEnrichmentLogs = await enrichTrapiResultsWithPfocrFigures(this._results);
this.logs.push(...pfocrEnrichmentLogs);
} catch (err) {
debug('Error enriching with PFOCR figures: ', err);
this.logs.push(new LogEntry('DEBUG', null, 'Error enriching with PFOCR figures: ', err).getLog());
}
// try {
// const pfocrEnrichmentLogs = await enrichTrapiResultsWithPfocrFigures(this._results);
// this.logs.push(...pfocrEnrichmentLogs);
// } catch (err) {
// debug('Error enriching with PFOCR figures: ', err);
// this.logs.push(new LogEntry('DEBUG', null, 'Error enriching with PFOCR figures: ', err).getLog());
// }
debug(`Scored ${resultsWithScore} results with NGD score, scored ${resultsWithoutScore} results without NGD.`);
this.logs.push(
new LogEntry(
Expand All @@ -503,7 +502,7 @@ export default class TrapiResultsAssembler {
new LogEntry(
'DEBUG',
null,
`Scoring/PFOCR figures disabled for KP endpoints; results not scored. Use ARA endpoints (/v1/query or /v1/asyncquery) for scoring/PFOCR figures.`,
`Scoring disabled for KP endpoints; results not scored. Use ARA endpoints (/v1/query or /v1/asyncquery) for scoring.`,
{
type: 'scoring',
scored: resultsWithScore,
Expand Down

0 comments on commit 1da1bde

Please sign in to comment.