owid · marcelgerber · May 1, 2024 · Mar 26, 2024 · Mar 26, 2024 · Mar 26, 2024
diff --git a/Makefile b/Makefile
@@ -41,6 +41,7 @@ help:
 	@echo '  make sync-images            sync all images from the remote master'
 	@echo '  make update.chart-entities  update the charts_x_entities join table'
 	@echo '  make reindex                reindex (or initialise) search in Algolia'
+	@echo '  make bench.search           run search benchmarks'
 	@echo
 	@echo '  OPS (staff-only)'
 	@echo '  make deploy                 Deploy your local site to production'
@@ -359,5 +360,9 @@ reindex: itsJustJavascript
 	node --enable-source-maps itsJustJavascript/baker/algolia/indexChartsToAlgolia.js
 	node --enable-source-maps itsJustJavascript/baker/algolia/indexExplorerViewsToAlgolia.js
 
+bench.search: itsJustJavascript # runs the compiled site/search/evaluateSearch script
+	@echo '==> Running search benchmarks'
+	@node --enable-source-maps itsJustJavascript/site/search/evaluateSearch.js
+
 clean:
 	rm -rf node_modules itsJustJavascript
diff --git a/settings/serverSettings.ts b/settings/serverSettings.ts
@@ -205,3 +205,7 @@ export const OPENAI_API_KEY: string = serverSettings.OPENAI_API_KEY ?? ""
 
 export const SLACK_BOT_OAUTH_TOKEN: string =
     serverSettings.SLACK_BOT_OAUTH_TOKEN ?? ""
+
+// search evaluation: base URL that site/search/evaluateSearch.ts fetches its query dataset files from
+export const SEARCH_EVAL_URL: string =
+    "https://pub-ec761fe0df554b02bc605610f3296000.r2.dev"
diff --git a/site/search/evaluateSearch.ts b/site/search/evaluateSearch.ts
@@ -0,0 +1,171 @@
+/**
+ * Simulate searches against our Algolia index and evaluate the results.
+ */
+
+import {
+    ALGOLIA_ID,
+    ALGOLIA_SEARCH_KEY,
+} from "../../settings/clientSettings.js"
+import { SEARCH_EVAL_URL } from "../../settings/serverSettings.js"
+import { getIndexName } from "./searchClient.js"
+import algoliasearch, { SearchClient, SearchIndex } from "algoliasearch"
+
+/* eslint-disable no-console */
+
+// this many articles are displayed un-collapsed, only score this many results
+// (both values are the k cut-offs for precision@k; the larger one is also the
+// number of hits requested per search)
+const N_ARTICLES_QUICK_RESULTS = 2
+const N_ARTICLES_LONG_RESULTS = 4
+
+// upper bound on the number of search requests in flight at the same time
+const CONCURRENT_QUERIES = 10
+
+/** A named list of evaluation queries. */
+interface QueryDataset {
+    name: string
+    queries: Query[]
+}
+
+/** Metric name (e.g. "precision@2") mapped to its numeric value. */
+type Scores = Record<string, number>
+
+/** One search string plus the slugs that count as relevant hits for it. */
+interface Query {
+    query: string
+    slugs: string[]
+}
+
+/** A query after it was run: the dataset's slugs, the returned slugs, and the scores. */
+interface ScoredQuery {
+    query: string
+    expected: string[]
+    actual: string[]
+    scores: Scores
+}
+
+/** Aggregated outcome of one evaluation run against a single index. */
+interface SearchResults {
+    name: string
+    scope: "articles" | "charts" | "all"
+    scores: Scores
+    numQueries: number
+    algoliaApp: string
+    algoliaIndex: string
+}
+
+// file names of the available query datasets; the chosen one is appended to
+// SEARCH_EVAL_URL to build the download URL
+const QUERY_FILES = {
+    single: "synthetic-queries-single-2024-03-25.json",
+    multi: "synthetic-queries-2024-03-25.json",
+}
+
+/** Script entry point: evaluates one query dataset and prints the outcome. */
+const main = async (): Promise<void> => {
+    // only do the multi, since it contains the single-word set as well
+    const datasetFile = QUERY_FILES.multi
+    await evaluateAndPrint(datasetFile)
+}
+
+/**
+ * Evaluates article search for the given dataset file and writes the
+ * resulting summary to stdout as pretty-printed JSON.
+ */
+const evaluateAndPrint = async (name: string): Promise<void> => {
+    const summary = await evaluateArticleSearch(name)
+    const rendered = JSON.stringify(summary, null, 2)
+    console.log(rendered)
+}
+
+/**
+ * Runs every query of the named dataset against the "pages" index and
+ * averages each per-query score.
+ *
+ * @param name file name of the query dataset, relative to SEARCH_EVAL_URL
+ * @returns the mean of every score, rounded to three decimal places, plus
+ *     metadata about the run; `scores` is empty when the dataset holds no
+ *     queries
+ */
+const evaluateArticleSearch = async (name: string): Promise<SearchResults> => {
+    const ds = await fetchQueryDataset(name)
+    const indexName = getIndexName("pages")
+
+    // make a search client
+    const client = getClient()
+    const index = client.initIndex(indexName)
+
+    // run the evaluation
+    const results = await simulateQueries(index, ds.queries)
+
+    // an empty dataset produces no results, so there is nothing to average
+    const scoreNames = results.length > 0 ? Object.keys(results[0].scores) : []
+    const scores: Scores = {}
+    for (const scoreName of scoreNames) {
+        const total = results.reduce((sum, r) => sum + r.scores[scoreName], 0)
+        // keep three decimal places of the mean
+        scores[scoreName] = parseFloat((total / results.length).toFixed(3))
+    }
+
+    return {
+        name: ds.name,
+        scope: "articles",
+        scores: scores,
+        numQueries: ds.queries.length,
+        algoliaApp: ALGOLIA_ID,
+        algoliaIndex: indexName,
+    }
+}
+
+/** Creates an Algolia search client from the configured app id and search key. */
+const getClient = (): SearchClient => {
+    const client = algoliasearch(ALGOLIA_ID, ALGOLIA_SEARCH_KEY)
+    return client
+}
+
+/**
+ * Downloads a query dataset from SEARCH_EVAL_URL.
+ *
+ * @param name file name of the dataset, appended to SEARCH_EVAL_URL
+ * @returns the dataset, labelled with the file name it was loaded from
+ * @throws Error when the HTTP response is not successful or the payload is
+ *     not a JSON array
+ */
+const fetchQueryDataset = async (name: string): Promise<QueryDataset> => {
+    const url = `${SEARCH_EVAL_URL}/${name}`
+    const resp = await fetch(url)
+    // fail with the HTTP status instead of a confusing JSON parse error
+    if (!resp.ok) {
+        throw new Error(
+            `Failed to fetch query dataset ${url}: ${resp.status} ${resp.statusText}`
+        )
+    }
+    const queries: unknown = await resp.json()
+    if (!Array.isArray(queries)) {
+        throw new Error(`Query dataset ${url} is not a JSON array`)
+    }
+    return { name, queries }
+}
+
+/**
+ * Runs a single query against the index and scores the returned slugs
+ * against the slugs listed for that query in the dataset.
+ *
+ * @param index Algolia index to search
+ * @param query search string together with its relevant slugs
+ * @returns the query, its relevant and returned slugs, and the scores
+ */
+const simulateQuery = async (
+    index: SearchIndex,
+    query: Query
+): Promise<ScoredQuery> => {
+    // only the slug is needed, and no more hits than the largest cut-off
+    const { hits } = await index.search<{ slug: string }>(query.query, {
+        attributesToRetrieve: ["slug"],
+        hitsPerPage: N_ARTICLES_LONG_RESULTS,
+    })
+    const actual = hits.map((hit) => hit.slug)
+    const scores = scoreResults(query.slugs, actual)
+    return { query: query.query, expected: query.slugs, actual, scores }
+}
+
+/**
+ * Computes precision@k for both article cut-offs.
+ *
+ * @param relevant slugs that count as correct for the query
+ * @param actual slugs returned by the search, in ranking order
+ * @returns one `precision@k` entry per cut-off: the share of the first k
+ *     returned slugs that are relevant, or 0 when nothing was returned
+ */
+const scoreResults = (relevant: string[], actual: string[]): Scores => {
+    const cutoffs = [N_ARTICLES_QUICK_RESULTS, N_ARTICLES_LONG_RESULTS]
+    const result: Scores = {}
+
+    cutoffs.forEach((cutoff) => {
+        const topHits = actual.slice(0, cutoff)
+        let matches = 0
+        for (const slug of topHits) {
+            if (relevant.includes(slug)) matches += 1
+        }
+        // the denominator is the number of hits actually returned (at most k)
+        result[`precision@${cutoff}`] =
+            topHits.length === 0 ? 0 : matches / topHits.length
+    })
+
+    return result
+}
+
+/**
+ * Runs all queries against the index with at most CONCURRENT_QUERIES
+ * requests in flight at any time.
+ *
+ * @param index Algolia index to search
+ * @param queries queries to simulate
+ * @returns one scored query per input query, in the same order as `queries`
+ */
+const simulateQueries = async (
+    index: SearchIndex,
+    queries: Query[]
+): Promise<ScoredQuery[]> => {
+    // NOTE: a rate-limited version of:
+    //
+    // const scores = await Promise.all(
+    //     queries.map((query) => simulateQuery(index, query))
+    // )
+
+    const scores: ScoredQuery[] = new Array(queries.length)
+    let nextIndex = 0
+
+    // each worker claims the next unprocessed query until none are left;
+    // claiming happens synchronously, so no query is processed twice
+    const worker = async (): Promise<void> => {
+        while (nextIndex < queries.length) {
+            const position = nextIndex++
+            scores[position] = await simulateQuery(index, queries[position])
+        }
+    }
+
+    const numWorkers = Math.min(CONCURRENT_QUERIES, queries.length)
+    await Promise.all(Array.from({ length: numWorkers }, () => worker()))
+
+    return scores
+}
+
+// run the script; report a failure and signal it through the exit code
+main().catch((error: unknown) => {
+    console.error(error)
+    process.exitCode = 1
+})