Skip to content

Commit

Permalink
Merge pull request #3400 from owid/search-evaluation-algolia
Browse files Browse the repository at this point in the history
🔨 Evaluate search performance for articles
  • Loading branch information
marcelgerber authored May 1, 2024
2 parents dff12bf + 44b44d7 commit d30a2b7
Show file tree
Hide file tree
Showing 3 changed files with 180 additions and 0 deletions.
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ help:
@echo ' make sync-images sync all images from the remote master'
@echo ' make update.chart-entities update the charts_x_entities join table'
@echo ' make reindex reindex (or initialise) search in Algolia'
@echo ' make bench.search run search benchmarks'
@echo
@echo ' OPS (staff-only)'
@echo ' make deploy Deploy your local site to production'
Expand Down Expand Up @@ -359,5 +360,9 @@ reindex: itsJustJavascript
node --enable-source-maps itsJustJavascript/baker/algolia/indexChartsToAlgolia.js
node --enable-source-maps itsJustJavascript/baker/algolia/indexExplorerViewsToAlgolia.js

bench.search: itsJustJavascript
@echo '==> Running search benchmarks'
@node --enable-source-maps itsJustJavascript/site/search/evaluateSearch.js

clean:
rm -rf node_modules itsJustJavascript
4 changes: 4 additions & 0 deletions settings/serverSettings.ts
Original file line number Diff line number Diff line change
Expand Up @@ -205,3 +205,7 @@ export const OPENAI_API_KEY: string = serverSettings.OPENAI_API_KEY ?? ""

export const SLACK_BOT_OAUTH_TOKEN: string =
serverSettings.SLACK_BOT_OAUTH_TOKEN ?? ""

// search evaluation
export const SEARCH_EVAL_URL: string =
"https://pub-ec761fe0df554b02bc605610f3296000.r2.dev"
171 changes: 171 additions & 0 deletions site/search/evaluateSearch.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
/**
* Simulate searches against our Algolia index and evaluate the results.
*/

import {
ALGOLIA_ID,
ALGOLIA_SEARCH_KEY,
} from "../../settings/clientSettings.js"
import { SEARCH_EVAL_URL } from "../../settings/serverSettings.js"
import { getIndexName } from "./searchClient.js"
import algoliasearch from "algoliasearch"

/* eslint-disable no-console */

// this many articles are displayed un-collapsed, only score this many results
const N_ARTICLES_QUICK_RESULTS = 2
const N_ARTICLES_LONG_RESULTS = 4

const CONCURRENT_QUERIES = 10

type QueryDataset = {
name: string
queries: Query[]
}

type Scores = { [key: string]: number }

type Query = {
query: string
slugs: string[]
}

type ScoredQuery = {
query: string
expected: string[]
actual: string[]
scores: Scores
}

type SearchResults = {
name: string
scope: "articles" | "charts" | "all"
scores: Scores
numQueries: number
algoliaApp: string
algoliaIndex: string
}

const QUERY_FILES = {
single: "synthetic-queries-single-2024-03-25.json",
multi: "synthetic-queries-2024-03-25.json",
}

const main = async (): Promise<void> => {
// only do the multi, since it contains the single-word set as well
await evaluateAndPrint(QUERY_FILES.multi)
}

const evaluateAndPrint = async (name: string): Promise<void> => {
const results = await evaluateArticleSearch(name)
console.log(JSON.stringify(results, null, 2))
}

const evaluateArticleSearch = async (name: string): Promise<SearchResults> => {
const ds = await fetchQueryDataset(name)
const indexName = getIndexName("pages")

// make a search client
const client = getClient()
const index = client.initIndex(indexName)

// run the evaluation
const results = await simulateQueries(index, ds.queries)
const scores: Scores = {}
for (const scoreName of Object.keys(results[0].scores)) {
const mean =
results.map((r) => r.scores[scoreName]).reduce((a, b) => a + b) /
results.length
scores[scoreName] = parseFloat(mean.toFixed(3))
}

// print the results to two decimal places
return {
name: ds.name,
scope: "articles",
scores: scores,
numQueries: ds.queries.length,
algoliaApp: ALGOLIA_ID,
algoliaIndex: indexName,
}
}

const getClient = (): any => {
const client = algoliasearch(ALGOLIA_ID, ALGOLIA_SEARCH_KEY)
return client
}

const fetchQueryDataset = async (name: string): Promise<QueryDataset> => {
const url: string = `${SEARCH_EVAL_URL}/${name}`
const resp = await fetch(url)
const jsonData = await resp.json()
return { name, queries: jsonData }
}

const simulateQuery = async (
index: any,
query: Query
): Promise<ScoredQuery> => {
const { hits } = await index.search(query.query)
const actual = hits.map((h: any) => h.slug)
const scores = scoreResults(query.slugs, actual)
return { query: query.query, expected: query.slugs, actual, scores }
}

const scoreResults = (relevant: string[], actual: string[]): Scores => {
const scores: Scores = {}

for (const k of [N_ARTICLES_QUICK_RESULTS, N_ARTICLES_LONG_RESULTS]) {
const key = `precision@${k}`
const actualTruncated = actual.slice(0, k)
const n = actualTruncated.length
if (n === 0) {
scores[key] = 0
continue
}

const correct = actualTruncated.filter((a) =>
relevant.includes(a)
).length
scores[key] = correct / n
}
return scores
}

const simulateQueries = async (
index: any,
queries: Query[]
): Promise<ScoredQuery[]> => {
// NOTE: should be a rate-limited version of:
//
// const scores = await Promise.all(
// queries.map((query) => simulateQuery(index, query))
// )

let activeQueries = 0
let i = 0
const scores: ScoredQuery[] = []

const next = async () => {
if (i >= queries.length) return
const query = queries[i++]
activeQueries++
const score = await simulateQuery(index, query)
scores.push(score)
activeQueries--
if (i < queries.length) {
await next()
}
}

const promises = []
while (activeQueries < CONCURRENT_QUERIES && i < queries.length) {
promises.push(next())
}

await Promise.all(promises)

return scores
}

main()

Check failure on line 171 in site/search/evaluateSearch.ts

View workflow job for this annotation

GitHub Actions / eslint

Promises must be awaited, end with a call to .catch, end with a call to .then with a rejection handler or be explicitly marked as ignored with the `void` operator

Check failure on line 171 in site/search/evaluateSearch.ts

View workflow job for this annotation

GitHub Actions / eslint

Promises must be awaited, end with a call to .catch, end with a call to .then with a rejection handler or be explicitly marked as ignored with the `void` operator

0 comments on commit d30a2b7

Please sign in to comment.