Skip to content

Commit

Permalink
Merge pull request #3428 from owid/search-explorer-indexing
Browse files Browse the repository at this point in the history
feat(algolia): index explorer views to Algolia
  • Loading branch information
marcelgerber authored Apr 11, 2024
2 parents 6be7d58 + 7dbbc49 commit f364c91
Show file tree
Hide file tree
Showing 6 changed files with 325 additions and 7 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@ reindex: itsJustJavascript
node --enable-source-maps itsJustJavascript/baker/algolia/indexToAlgolia.js
node --enable-source-maps itsJustJavascript/baker/algolia/indexChartsToAlgolia.js
node --enable-source-maps itsJustJavascript/baker/algolia/indexExplorersToAlgolia.js
node --enable-source-maps itsJustJavascript/baker/algolia/indexExplorerViewsToAlgolia.js

clean:
rm -rf node_modules itsJustJavascript
26 changes: 26 additions & 0 deletions baker/algolia/configureAlgolia.ts
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,29 @@ export const configureAlgolia = async () => {
disableTypoToleranceOnAttributes: ["text"],
})

const explorerViewsIndex = client.initIndex(
getIndexName(SearchIndexName.ExplorerViews)
)

await explorerViewsIndex.setSettings({
...baseSettings,
searchableAttributes: [
"unordered(explorerTitle)",
"unordered(viewTitle)",
"unordered(viewSettings)",
],
customRanking: [
// For multiple explorer views with the same title, we want to avoid surfacing duplicates.
// So, rank a result with viewTitleIndexWithinExplorer=0 way more highly than one with 1, 2, etc.
"asc(viewTitleIndexWithinExplorer)",
"desc(score)",
"asc(viewIndexWithinExplorer)",
],
attributeForDistinct: "explorerSlug",
distinct: 4,
minWordSizefor1Typo: 6,
})

const synonyms = [
["owid", "our world in data"],
["kids", "children"],
Expand Down Expand Up @@ -308,6 +331,9 @@ export const configureAlgolia = async () => {
await explorersIndex.saveSynonyms(algoliaSynonyms, {
replaceExistingSynonyms: true,
})
await explorerViewsIndex.saveSynonyms(algoliaSynonyms, {
replaceExistingSynonyms: true,
})

if (TOPICS_CONTENT_GRAPH) {
const graphIndex = client.initIndex(CONTENT_GRAPH_ALGOLIA_INDEX)
Expand Down
289 changes: 289 additions & 0 deletions baker/algolia/indexExplorerViewsToAlgolia.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,289 @@
import * as db from "../../db/db.js"
import { ExplorerBlockGraphers } from "./indexExplorersToAlgolia.js"
import { DecisionMatrix } from "../../explorer/ExplorerDecisionMatrix.js"
import { tsvFormat } from "d3-dsv"
import {
ExplorerChoiceParams,
ExplorerControlType,
} from "../../explorer/ExplorerConstants.js"
import { GridBoolean } from "../../gridLang/GridLangConstants.js"
import { getAnalyticsPageviewsByUrlObj } from "../../db/model/Pageview.js"
import { ALGOLIA_INDEXING } from "../../settings/serverSettings.js"
import { getAlgoliaClient } from "./configureAlgolia.js"
import { getIndexName } from "../../site/search/searchClient.js"
import { SearchIndexName } from "../../site/search/searchTypes.js"
import { groupBy, keyBy, orderBy } from "lodash"
import { MarkdownTextWrap } from "@ourworldindata/components"

interface ExplorerViewEntry {
viewTitle: string
viewSubtitle: string
viewSettings: string[]
viewQueryParams: string

viewGrapherId?: number

/**
* We often have several views with the same title within an explorer, e.g. "Population".
* In order to only display _one_ of these views in search results, we need a way to demote duplicates.
* This attribute is used for that: The highest-scored such view will be given a value of 0, the second-highest 1, etc.
*/
viewTitleIndexWithinExplorer: number

// Potential ranking criteria
viewIndexWithinExplorer: number
titleLength: number
numNonDefaultSettings: number
// viewViews_7d: number
}

interface ExplorerViewEntryWithExplorerInfo extends ExplorerViewEntry {
explorerSlug: string
explorerTitle: string
explorerSubtitle: string
explorerViews_7d: number
viewTitleAndExplorerSlug: string // used for deduplication: `viewTitle | explorerSlug`
numViewsWithinExplorer: number

score: number

objectID?: string
}

// Creates a search-ready string from a choice.
// Special handling is pretty much only necessary for checkboxes: If they are not ticked, then their name is not included.
// Imagine a "Per capita" checkbox, for example. If it's not ticked, then we don't want searches for "per capita" to wrongfully match it.
const explorerChoiceToViewSettings = (
choices: ExplorerChoiceParams,
decisionMatrix: DecisionMatrix
): string[] => {
return Object.entries(choices).map(([choiceName, choiceValue]) => {
const choiceControlType =
decisionMatrix.choiceNameToControlTypeMap.get(choiceName)
if (choiceControlType === ExplorerControlType.Checkbox)
return choiceValue === GridBoolean.true ? choiceName : ""
else return choiceValue
})
}

const computeScore = (
record: Omit<ExplorerViewEntry, "viewTitleIndexWithinExplorer"> &
Partial<ExplorerViewEntryWithExplorerInfo>
) =>
(record.explorerViews_7d ?? 0) * 10 -
record.numNonDefaultSettings * 50 -
record.titleLength

const getExplorerViewRecordsForExplorerSlug = async (
trx: db.KnexReadonlyTransaction,
slug: string
): Promise<ExplorerViewEntry[]> => {
const explorerConfig = await trx
.table("explorers")
.select("config")
.where({ slug })
.first()
.then((row) => JSON.parse(row.config) as any)

const explorerGrapherBlock: ExplorerBlockGraphers =
explorerConfig.blocks.filter(
(block: any) => block.type === "graphers"
)[0] as ExplorerBlockGraphers

if (explorerGrapherBlock === undefined)
throw new Error(`Explorer ${slug} has no grapher block`)

// TODO: Maybe make DecisionMatrix accept JSON directly
const tsv = tsvFormat(explorerGrapherBlock.block)
const explorerDecisionMatrix = new DecisionMatrix(tsv)

console.log(
`Processing explorer ${slug} (${explorerDecisionMatrix.numRows} rows)`
)

const defaultSettings = explorerDecisionMatrix.defaultSettings

const records = explorerDecisionMatrix
.allDecisionsAsQueryParams()
.map((choice, i) => {
explorerDecisionMatrix.setValuesFromChoiceParams(choice)

// Check which choices are non-default, i.e. are not the first available option in a dropdown/radio
const nonDefaultSettings = Object.entries(
explorerDecisionMatrix.availableChoiceOptions
).filter(([choiceName, choiceOptions]) => {
// Keep only choices which are not the default, which is:
// - either the options marked as `default` in the decision matrix
// - or the first available option in the decision matrix
return (
choiceOptions.length > 1 &&
!(defaultSettings[choiceName] !== undefined
? defaultSettings[choiceName] === choice[choiceName]
: choice[choiceName] === choiceOptions[0])
)
})

const record: Omit<
ExplorerViewEntry,
"viewTitleIndexWithinExplorer"
> = {
viewTitle: explorerDecisionMatrix.selectedRow.title,
viewSubtitle: explorerDecisionMatrix.selectedRow.subtitle,
viewSettings: explorerChoiceToViewSettings(
choice,
explorerDecisionMatrix
),
viewGrapherId: explorerDecisionMatrix.selectedRow.grapherId,
viewQueryParams: explorerDecisionMatrix.toString(),

viewIndexWithinExplorer: i,
titleLength: explorerDecisionMatrix.selectedRow.title?.length,
numNonDefaultSettings: nonDefaultSettings.length,
}
return record
})

// Enrich `grapherId`-powered views with title/subtitle
const grapherIds = records
.filter((record) => record.viewGrapherId !== undefined)
.map((record) => record.viewGrapherId as number)

if (grapherIds.length) {
console.log(
`Fetching grapher info from ${grapherIds.length} graphers for explorer ${slug}`
)
const grapherIdToTitle = await trx
.table("charts")
.select(
"id",
trx.raw("config->>'$.title' as title"),
trx.raw("config->>'$.subtitle' as subtitle")
)
.whereIn("id", grapherIds)
.andWhereRaw("config->>'$.isPublished' = 'true'")
.then((rows) => keyBy(rows, "id"))

for (const record of records) {
if (record.viewGrapherId !== undefined) {
const grapherInfo = grapherIdToTitle[record.viewGrapherId]
if (grapherInfo === undefined) {
console.warn(
`Grapher id ${record.viewGrapherId} not found for explorer ${slug}`
)
continue
}
record.viewTitle = grapherInfo.title
record.viewSubtitle = grapherInfo.subtitle
record.titleLength = grapherInfo.title?.length
}
}
}

// Remove Markdown from viewSubtitle; do this after fetching grapher info above, as it might also contain Markdown
records.forEach((record) => {
if (record.viewSubtitle) {
record.viewSubtitle = new MarkdownTextWrap({
text: record.viewSubtitle,
fontSize: 10, // doesn't matter, but is a mandatory field
}).plaintext
}
})

// Compute viewTitleIndexWithinExplorer:
// First, sort by score descending (ignoring views_7d, which is not relevant _within_ an explorer).
// Then, group by viewTitle.
// Finally, ungroup again, and keep track of the index of each element within the group.
const recordsSortedByScore = orderBy(
records,
(record) => computeScore(record),
"desc"
)
const recordsGroupedByViewTitle = groupBy(recordsSortedByScore, "viewTitle")
const recordsWithIndexWithinExplorer = Object.values(
recordsGroupedByViewTitle
).flatMap((recordsGroup) =>
recordsGroup.map((record, i) => ({
...record,
viewTitleIndexWithinExplorer: i,
}))
)

// TODO: Handle indicator-based explorers

return recordsWithIndexWithinExplorer
}

const getExplorerViewRecords = async (
trx: db.KnexReadonlyTransaction
): Promise<ExplorerViewEntryWithExplorerInfo[]> => {
const publishedExplorers = Object.values(
await db.getPublishedExplorersBySlug(trx)
)

const pageviews = await getAnalyticsPageviewsByUrlObj(trx)

let records = [] as ExplorerViewEntryWithExplorerInfo[]
for (const explorerInfo of publishedExplorers) {
const explorerViewRecords = await getExplorerViewRecordsForExplorerSlug(
trx,
explorerInfo.slug
)

const explorerPageviews =
pageviews[`/explorers/${explorerInfo.slug}`]?.views_7d ?? 0
const unscoredRecords = explorerViewRecords.map(
(record, i): Omit<ExplorerViewEntryWithExplorerInfo, "score"> => ({
...record,
explorerSlug: explorerInfo.slug,
explorerTitle: explorerInfo.title,
explorerSubtitle: explorerInfo.subtitle,
explorerViews_7d: explorerPageviews,
viewTitleAndExplorerSlug: `${record.viewTitle} | ${explorerInfo.slug}`,
numViewsWithinExplorer: explorerViewRecords.length,

objectID: `${explorerInfo.slug}-${i}`,
})
)
records = records.concat(
unscoredRecords.map((record) => ({
...record,
score: computeScore(record),
}))
)
}

return records
}

const indexExplorerViewsToAlgolia = async () => {
if (!ALGOLIA_INDEXING) return

const client = getAlgoliaClient()
if (!client) {
console.error(
`Failed indexing explorer views (Algolia client not initialized)`
)
return
}

try {
const index = client.initIndex(
getIndexName(SearchIndexName.ExplorerViews)
)

const records = await db.knexReadonlyTransaction(
getExplorerViewRecords,
db.TransactionCloseMode.Close
)
await index.replaceAllObjects(records)
} catch (e) {
console.log("Error indexing explorer views to Algolia:", e)
}
}

process.on("unhandledRejection", (e) => {
console.error(e)
process.exit(1)
})

void indexExplorerViewsToAlgolia()
2 changes: 1 addition & 1 deletion baker/algolia/indexExplorersToAlgolia.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ type ExplorerBlockColumns = {
block: { name: string; additionalInfo?: string }[]
}

type ExplorerBlockGraphers = {
export type ExplorerBlockGraphers = {
type: "graphers"
block: {
title?: string
Expand Down
Loading

0 comments on commit f364c91

Please sign in to comment.