Skip to content

Commit

Permalink
enhance(search): refine entity-picking logic
Browse files Browse the repository at this point in the history
  • Loading branch information
marcelgerber committed Apr 10, 2024
1 parent cbfffd5 commit c19f5e2
Showing 1 changed file with 25 additions and 13 deletions.
38 changes: 25 additions & 13 deletions site/search/SearchUtils.tsx
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { HitAttributeHighlightResult } from "instantsearch.js/es/types/results.js"
import { IChartHit } from "./searchTypes.js"
import { EntityName } from "@ourworldindata/types"
import { removeTrailingParenthetical } from "@ourworldindata/utils"
import { countries, removeTrailingParenthetical } from "@ourworldindata/utils"

/**
 * Strips highlight markup from a string.
 *
 * Every opening or closing `<mark>` and `<strong>` tag is removed; the text
 * between the tags is kept as-is. Any other markup is left untouched.
 *
 * @param text - the string that may contain `<mark>`/`<strong>` tags
 * @returns `text` with all such tags removed
 */
const removeHighlightTags = (text: string) => {
    // The `g` flag makes `replace` remove every occurrence, not just the first.
    const withoutTags = text.replace(/<\/?(mark|strong)>/g, "")
    return withoutTags
}
Expand All @@ -12,26 +12,38 @@ export function pickEntitiesForChartHit(hit: IChartHit): EntityName[] {

const pickedEntities = availableEntitiesHighlighted
?.filter((highlightEntry) => {
// Keep the highlight if it is fully highlighted
if (highlightEntry.fullyHighlighted) return true
if (highlightEntry.matchLevel === "none") return false

// Remove any trailing parentheses, e.g. "Africa (UN)" -> "Africa"
const withoutTrailingParens = removeTrailingParenthetical(
const entityNameWithoutTrailingParens = removeTrailingParenthetical(
removeHighlightTags(highlightEntry.value)
)

const matchedWordsLowerCase = highlightEntry.matchedWords.map(
(mw) => mw.toLowerCase()
// The sequence of words that Algolia matched; could be something like ["arab", "united", "republic"]
// which we want to check against the entity name
const matchedSequenceLowerCase = highlightEntry.matchedWords
.join(" ")
.toLowerCase()

// Pick entity if the matched sequence contains the full entity name
if (
matchedSequenceLowerCase.includes(
entityNameWithoutTrailingParens.toLowerCase()
)
)
return true

// Keep the highlight if every word (except for trailing parens) is fully highlighted
// This will also highlight "Central African Republic" when searching for "african central republic",
// but that's probably okay
return withoutTrailingParens
.toLowerCase()
.split(" ")
.every((w) => matchedWordsLowerCase.includes(w))
const country = countries.find(
(c) => c.name === entityNameWithoutTrailingParens
)
if (country?.variantNames) {
// Pick entity if the matched sequence contains any of the variant names
return country.variantNames.some((variant) =>
matchedSequenceLowerCase.includes(variant.toLowerCase())
)
}

return false
})
.map((highlightEntry) => removeHighlightTags(highlightEntry.value))

Expand Down

0 comments on commit c19f5e2

Please sign in to comment.