Skip to content

Commit

Permalink
feat(search): match geographic entities in search (#3388)
Browse files Browse the repository at this point in the history
* chore(search): make country name variants one-way synonyms

* feat(search): match geographic entities within search

* fix(algolia): sort entity names with variant names first, so Algolia synonyms can work

* enhance(search): refine entity-picking logic

* enhance(search): sort entity names

* perf(algolia): optimize chart indexing code a bit

* enhance(search): show entities as comma-separated list

* enhance(search): use settings for URLs

* fix(search): recognize strings like "high-income countries"
  • Loading branch information
marcelgerber authored Apr 16, 2024
1 parent f7e23da commit 7f88da5
Show file tree
Hide file tree
Showing 9 changed files with 167 additions and 11 deletions.
2 changes: 2 additions & 0 deletions .env.example-full
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ IMAGE_HOSTING_R2_SECRET_ACCESS_KEY='' # optional

OPENAI_API_KEY=''

GRAPHER_DYNAMIC_THUMBNAIL_URL='' # optional; can set this to https://ourworldindata.org/grapher/thumbnail to use the live thumbnail worker

# enable search (readonly)
ALGOLIA_ID='' # optional
ALGOLIA_SEARCH_KEY='' # optional
Expand Down
23 changes: 16 additions & 7 deletions baker/algolia/configureAlgolia.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import {
ALGOLIA_INDEXING,
ALGOLIA_SECRET_KEY,
} from "../../settings/serverSettings.js"
import { countries, regions } from "@ourworldindata/utils"
import { countries, regions, excludeUndefined } from "@ourworldindata/utils"
import { SearchIndexName } from "../../site/search/searchTypes.js"
import { getIndexName } from "../../site/search/searchClient.js"

Expand Down Expand Up @@ -296,12 +296,6 @@ export const configureAlgolia = async () => {
["solar", "photovoltaic", "photovoltaics", "pv"],
]

// Send all our country variant names to algolia as synonyms
for (const country of countries) {
if (country.variantNames)
synonyms.push([country.name].concat(country.variantNames))
}

const algoliaSynonyms = synonyms.map((s) => {
return {
objectID: s.join("-"),
Expand All @@ -310,6 +304,21 @@ export const configureAlgolia = async () => {
} as Synonym
})

// Send all our country variant names to algolia as one-way synonyms
for (const country of countries) {
const alternatives = excludeUndefined([
country.shortName,
...(country.variantNames ?? []),
])
for (const alternative of alternatives)
algoliaSynonyms.push({
objectID: `${alternative}->${country.name}`,
type: "oneWaySynonym",
input: alternative,
synonyms: [country.name],
})
}

await pagesIndex.saveSynonyms(algoliaSynonyms, {
replaceExistingSynonyms: true,
})
Expand Down
35 changes: 34 additions & 1 deletion baker/algolia/indexChartsToAlgolia.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ import {
OwidGdocLinkType,
excludeNullish,
isNil,
countries,
orderBy,
removeTrailingParenthetical,
} from "@ourworldindata/utils"
import { MarkdownTextWrap } from "@ourworldindata/components"
import { getAnalyticsPageviewsByUrlObj } from "../../db/model/Pageview.js"
Expand All @@ -20,6 +23,35 @@ const computeScore = (record: Omit<ChartRecord, "score">): number => {
return numRelatedArticles * 500 + views_7d
}

// The `name` of every country that also has a `shortName` or a non-empty
// `variantNames` list, i.e. every country that can be referred to by some
// alternative name.
const countriesWithVariantNames = new Set(
    countries.flatMap((country) => {
        const hasAlternativeName =
            country.variantNames?.length || country.shortName
        return hasAlternativeName ? [country.name] : []
    })
)

/**
 * Orders a chart's entity names for indexing.
 *
 * Entities whose name (minus any trailing parenthetical) belongs to a country
 * with alternative names come first; within each of the two groups, names are
 * sorted in ascending order. A missing list yields an empty array.
 */
const processAvailableEntities = (availableEntities: string[] | null) => {
    // Why the reordering: Algolia behaves oddly with synonyms. Given a synonym
    // "USA" -> "United States" and a search for "USA", it seems that Algolia
    // only finds the match within `availableEntities` if the entity sits
    // within the first 100-or-so entries of the array. Putting the countries
    // that have variant names at the top is the easy workaround.
    // - @marcelgerber, 2024-03-25
    const hasVariantNames = (entityName: string) =>
        countriesWithVariantNames.has(removeTrailingParenthetical(entityName))
    const byName = (entityName: string) => entityName

    return availableEntities
        ? orderBy(
              availableEntities,
              [hasVariantNames, byName],
              ["desc", "asc"]
          )
        : []
}

const getChartsRecords = async (
knex: db.KnexReadonlyTransaction
): Promise<ChartRecord[]> => {
Expand Down Expand Up @@ -81,14 +113,15 @@ const getChartsRecords = async (
if (c.entityNames.length < 12000)
c.entityNames = excludeNullish(
JSON.parse(c.entityNames as string) as (string | null)[]
)
) as string[]
else {
console.info(
`Chart ${c.id} has too many entities, skipping its entities`
)
c.entityNames = []
}
}
c.entityNames = processAvailableEntities(c.entityNames)

c.tags = JSON.parse(c.tags)
c.keyChartForTags = JSON.parse(c.keyChartForTags as string).filter(
Expand Down
6 changes: 6 additions & 0 deletions packages/@ourworldindata/utils/src/Util.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1856,6 +1856,12 @@ export function cartesian<T>(matrix: T[][]): T[][] {
)
}

/**
 * Strips parenthetical content (and the whitespace in front of it) from
 * _the end_ of a string, e.g. "Africa (UN)" -> "Africa".
 *
 * Only strings ending in ")" are affected. The match is greedy: everything
 * from the first "(" on that line onwards is dropped, so "A (b) c (d)"
 * becomes "A".
 */
export function removeTrailingParenthetical(str: string): string {
    const trailingParenthetical = /\s*\(.*\)$/
    // The pattern is anchored to the end of the input, so a match always runs
    // to the last character; keeping the prefix is all that is needed.
    const matchStart = str.search(trailingParenthetical)
    return matchStart === -1 ? str : str.slice(0, matchStart)
}

export function isElementHidden(element: Element | null): boolean {
if (!element) return false
const computedStyle = window.getComputedStyle(element)
Expand Down
1 change: 1 addition & 0 deletions packages/@ourworldindata/utils/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ export {
checkIsDataInsight,
checkIsAuthor,
cartesian,
removeTrailingParenthetical,
isElementHidden,
} from "./Util.js"

Expand Down
4 changes: 4 additions & 0 deletions settings/clientSettings.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ export const BAKED_GRAPHER_EXPORTS_BASE_URL: string =
// Taken from the BAKED_SITE_EXPORTS_BASE_URL environment variable when it is
// set; otherwise defaults to the `/exports` path under BAKED_BASE_URL.
export const BAKED_SITE_EXPORTS_BASE_URL: string =
process.env.BAKED_SITE_EXPORTS_BASE_URL ?? `${BAKED_BASE_URL}/exports`

// Base URL that chart slugs are appended to when requesting a dynamically
// rendered thumbnail. Defaults to the `/thumbnail` path under
// BAKED_GRAPHER_URL.
//
// `||` rather than `??` is deliberate: `.env.example-full` ships this variable
// as an empty string ("optional"), and an empty base URL is never valid, so
// an empty value must fall back to the default just like an unset one.
export const GRAPHER_DYNAMIC_THUMBNAIL_URL: string =
    process.env.GRAPHER_DYNAMIC_THUMBNAIL_URL ||
    `${BAKED_GRAPHER_URL}/thumbnail`

// Taken from the ADMIN_BASE_URL environment variable when it is set;
// otherwise an http URL is built from ADMIN_SERVER_HOST and ADMIN_SERVER_PORT.
export const ADMIN_BASE_URL: string =
process.env.ADMIN_BASE_URL ??
`http://${ADMIN_SERVER_HOST}:${ADMIN_SERVER_PORT}`
Expand Down
18 changes: 18 additions & 0 deletions site/search/Search.scss
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,24 @@ $reset-button-margin: 16px;
}
}

// List of entity names attached to a chart hit, laid out inline as a
// comma-separated run of text instead of a bulleted list.
.search-results__chart-hit-entities {
list-style: none;
font-size: 0.8em;

li {
display: inline;
color: $blue-50;

// Every item is followed by a ", " separator...
&::after {
content: ", ";
}

// ...except the last one, which gets no trailing separator.
&:last-child::after {
content: "";
}
}
}

/*
* Tabs / Filtering
**/
Expand Down
34 changes: 31 additions & 3 deletions site/search/SearchPanel.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@ import {
ALGOLIA_ID,
ALGOLIA_SEARCH_KEY,
BAKED_BASE_URL,
BAKED_GRAPHER_EXPORTS_BASE_URL,
BAKED_GRAPHER_URL,
GRAPHER_DYNAMIC_THUMBNAIL_URL,
} from "../../settings/clientSettings.js"
import { action, observable } from "mobx"
import { observer } from "mobx-react"
Expand Down Expand Up @@ -68,7 +70,10 @@ import {
} from "@ourworldindata/grapher"
import type { SearchResults as AlgoliaSearchResultsType } from "algoliasearch-helper"
import { SiteAnalytics } from "../SiteAnalytics.js"
import { extractRegionNamesFromSearchQuery } from "./SearchUtils.js"
import {
extractRegionNamesFromSearchQuery,
pickEntitiesForChartHit,
} from "./SearchUtils.js"

const siteAnalytics = new SiteAnalytics()

Expand Down Expand Up @@ -119,9 +124,24 @@ function ChartHit({ hit }: { hit: IChartHit }) {
const [imgLoaded, setImgLoaded] = useState(false)
const [imgError, setImgError] = useState(false)

const entities = useMemo(
() => pickEntitiesForChartHit(hit),
// eslint-disable-next-line react-hooks/exhaustive-deps
[hit._highlightResult?.availableEntities]
)
const queryStr = useMemo(() => getEntityQueryStr(entities), [entities])
const previewUrl = queryStr
? `${GRAPHER_DYNAMIC_THUMBNAIL_URL}/${hit.slug}${queryStr}`
: `${BAKED_GRAPHER_EXPORTS_BASE_URL}/${hit.slug}.svg`

useEffect(() => {
setImgLoaded(false)
setImgError(false)
}, [previewUrl])

return (
<a
href={`${BAKED_GRAPHER_URL}/${hit.slug}`}
href={`${BAKED_GRAPHER_URL}/${hit.slug}${queryStr}`}
data-algolia-index={getIndexName(SearchIndexName.Charts)}
data-algolia-object-id={hit.objectID}
data-algolia-position={hit.__position}
Expand All @@ -134,11 +154,12 @@ function ChartHit({ hit }: { hit: IChartHit }) {
</div>
)}
<img
key={previewUrl}
className={cx({ loaded: imgLoaded, error: imgError })}
loading="lazy"
width={DEFAULT_GRAPHER_WIDTH}
height={DEFAULT_GRAPHER_HEIGHT}
src={`${BAKED_GRAPHER_URL}/exports/${hit.slug}.svg`}
src={previewUrl}
onLoad={() => setImgLoaded(true)}
onError={() => setImgError(true)}
/>
Expand All @@ -154,6 +175,13 @@ function ChartHit({ hit }: { hit: IChartHit }) {
{hit.variantName}
</span>
</div>
{entities.length > 0 && (
<ul className="search-results__chart-hit-entities">
{entities.map((entity) => (
<li key={entity}>{entity}</li>
))}
</ul>
)}
</a>
)
}
Expand Down
55 changes: 55 additions & 0 deletions site/search/SearchUtils.tsx
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
import { HitAttributeHighlightResult } from "instantsearch.js"
import { IChartHit } from "./searchTypes.js"
import { EntityName } from "@ourworldindata/types"
import {
Region,
getRegionByNameOrVariantName,
regions,
countries,
escapeRegExp,
removeTrailingParenthetical,
} from "@ourworldindata/utils"

const allCountryNamesAndVariants = regions.flatMap((c) => [
Expand All @@ -22,3 +27,53 @@ export const extractRegionNamesFromSearchQuery = (query: string) => {
if (regionNames.length === 0) return null
return regionNames.map(getRegionByNameOrVariantName) as Region[]
}

/**
 * Returns `text` with all opening and closing `<mark>` and `<strong>` tags
 * removed. Matching is case-sensitive and other tags are left untouched.
 */
const removeHighlightTags = (text: string): string => {
    const highlightTag = /<\/?(mark|strong)>/g
    return text.replace(highlightTag, "")
}

/**
 * Picks the entities of a chart hit that the search query refers to, based on
 * the highlight information Algolia returned for `availableEntities`.
 *
 * An entity is picked when the words Algolia matched, joined by spaces, start
 * with the entity's full name (ignoring a trailing parenthetical and treating
 * hyphens as spaces), or when they contain one of the alternative names
 * (short name or variant names) of the country carrying that name.
 *
 * @param hit - The chart hit to inspect.
 * @returns The picked entity names with highlight tags removed, sorted with
 *   the default `Array.prototype.sort` ordering; empty when the hit carries no
 *   highlight information for `availableEntities`.
 */
export function pickEntitiesForChartHit(hit: IChartHit): EntityName[] {
    const availableEntitiesHighlighted = hit._highlightResult
        ?.availableEntities as HitAttributeHighlightResult[] | undefined
    if (!availableEntitiesHighlighted) return []

    return availableEntitiesHighlighted
        .filter((highlightEntry) => {
            if (highlightEntry.matchLevel === "none") return false

            // Remove any trailing parentheses, e.g. "Africa (UN)" -> "Africa"
            const entityNameWithoutTrailingParens = removeTrailingParenthetical(
                removeHighlightTags(highlightEntry.value)
            )

            // The sequence of words that Algolia matched; could be something
            // like ["arab", "united", "republic"], which we want to check
            // against the entity name
            const matchedSequenceLowerCase = highlightEntry.matchedWords
                .join(" ")
                .toLowerCase()

            // Pick entity if the matched sequence starts with the full entity
            // name. Hyphens become spaces so that "high-income countries"
            // lines up with the matched words "high income countries".
            const normalizedEntityName = entityNameWithoutTrailingParens
                .replace(/-/g, " ")
                .toLowerCase()
            if (matchedSequenceLowerCase.startsWith(normalizedEntityName))
                return true

            const country = countries.find(
                (c) => c.name === entityNameWithoutTrailingParens
            )
            if (!country) return false

            // Pick entity if the matched sequence contains any alternative
            // name. The short name is included because it is registered as a
            // synonym at indexing time alongside the variant names; empty or
            // missing names are skipped since every string contains "".
            const alternativeNames = [
                country.shortName,
                ...(country.variantNames ?? []),
            ]
            return alternativeNames.some(
                (alternative) =>
                    !!alternative &&
                    matchedSequenceLowerCase.includes(alternative.toLowerCase())
            )
        })
        .map((highlightEntry) => removeHighlightTags(highlightEntry.value))
        .sort()
}

0 comments on commit 7f88da5

Please sign in to comment.