Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(search): match geographic entities in search #3388

Merged
2 changes: 2 additions & 0 deletions .env.example-full
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ IMAGE_HOSTING_R2_SECRET_ACCESS_KEY='' # optional

OPENAI_API_KEY=''

GRAPHER_DYNAMIC_THUMBNAIL_URL='' # optional; can set this to https://ourworldindata.org/grapher/thumbnail to use the live thumbnail worker

# enable search (readonly)
ALGOLIA_ID='' # optional
ALGOLIA_SEARCH_KEY='' # optional
Expand Down
23 changes: 16 additions & 7 deletions baker/algolia/configureAlgolia.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import {
ALGOLIA_INDEXING,
ALGOLIA_SECRET_KEY,
} from "../../settings/serverSettings.js"
import { countries, regions } from "@ourworldindata/utils"
import { countries, regions, excludeUndefined } from "@ourworldindata/utils"
import { SearchIndexName } from "../../site/search/searchTypes.js"
import { getIndexName } from "../../site/search/searchClient.js"

Expand Down Expand Up @@ -296,12 +296,6 @@ export const configureAlgolia = async () => {
["solar", "photovoltaic", "photovoltaics", "pv"],
]

// Send all our country variant names to algolia as synonyms
for (const country of countries) {
if (country.variantNames)
synonyms.push([country.name].concat(country.variantNames))
}

const algoliaSynonyms = synonyms.map((s) => {
return {
objectID: s.join("-"),
Expand All @@ -310,6 +304,21 @@ export const configureAlgolia = async () => {
} as Synonym
})

// Send all our country variant names to algolia as one-way synonyms
for (const country of countries) {
const alternatives = excludeUndefined([
country.shortName,
...(country.variantNames ?? []),
])
for (const alternative of alternatives)
algoliaSynonyms.push({
objectID: `${alternative}->${country.name}`,
type: "oneWaySynonym",
input: alternative,
synonyms: [country.name],
})
}

await pagesIndex.saveSynonyms(algoliaSynonyms, {
replaceExistingSynonyms: true,
})
Expand Down
35 changes: 34 additions & 1 deletion baker/algolia/indexChartsToAlgolia.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ import {
OwidGdocLinkType,
excludeNullish,
isNil,
countries,
orderBy,
removeTrailingParenthetical,
} from "@ourworldindata/utils"
import { MarkdownTextWrap } from "@ourworldindata/components"
import { getAnalyticsPageviewsByUrlObj } from "../../db/model/Pageview.js"
Expand All @@ -20,6 +23,35 @@ const computeScore = (record: Omit<ChartRecord, "score">): number => {
return numRelatedArticles * 500 + views_7d
}

// Canonical names of all countries that can also be referred to by some other
// name, i.e. that have a non-empty list of variant names and/or a short name.
const countriesWithVariantNames = new Set(
    countries.flatMap((country) =>
        country.variantNames?.length || country.shortName
            ? [country.name]
            : []
    )
)

/**
 * Brings a chart's entity names into the order in which they are sent to Algolia:
 * countries that have variant names first, then everything else, each group
 * sorted alphabetically. A missing entity list yields an empty array.
 */
const processAvailableEntities = (availableEntities: string[] | null) => {
    // Why the reordering is needed (observation by @marcelgerber, 2024-03-25):
    // Algolia behaves a bit oddly with synonyms. Given a synonym
    // "USA" -> "United States" and a search for "USA", Algolia only seems to
    // find the match inside `availableEntities` if the entity sits within the
    // first 100-or-so entries of the array. Moving the countries that have
    // variant names to the top is the easy way to make sure of that.
    const hasVariantNames = (entityName: string) =>
        countriesWithVariantNames.has(removeTrailingParenthetical(entityName))
    const byName = (entityName: string) => entityName

    return availableEntities
        ? orderBy(
              availableEntities,
              [hasVariantNames, byName],
              ["desc", "asc"]
          )
        : []
}

const getChartsRecords = async (
knex: db.KnexReadonlyTransaction
): Promise<ChartRecord[]> => {
Expand Down Expand Up @@ -81,14 +113,15 @@ const getChartsRecords = async (
if (c.entityNames.length < 12000)
c.entityNames = excludeNullish(
JSON.parse(c.entityNames as string) as (string | null)[]
)
) as string[]
else {
console.info(
`Chart ${c.id} has too many entities, skipping its entities`
)
c.entityNames = []
}
}
c.entityNames = processAvailableEntities(c.entityNames)

c.tags = JSON.parse(c.tags)
c.keyChartForTags = JSON.parse(c.keyChartForTags as string).filter(
Expand Down
6 changes: 6 additions & 0 deletions packages/@ourworldindata/utils/src/Util.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1856,6 +1856,12 @@ export function cartesian<T>(matrix: T[][]): T[][] {
)
}

/**
 * Removes any parenthetical content from _the end_ of a string, together with
 * the whitespace directly in front of it.
 * E.g. "Africa (UN)" -> "Africa"
 *
 * Everything from the first "(" on the last line of the string is dropped, so
 * "a (b) (c)" becomes "a". Strings that don't end in ")" - or that have no "("
 * on their last line - are returned unchanged.
 *
 * This is deliberately implemented with plain string scanning instead of
 * `str.replace(/\s*\(.*\)$/, "")`: that regular expression was flagged by
 * CodeQL ("Polynomial regular expression used on uncontrolled data") because
 * it backtracks quadratically on strings with many repetitions of " " or "(".
 * The scan below is linear and yields the same results as the regex did.
 *
 * @param str - the string to strip, e.g. an entity name
 * @returns `str` without its trailing parenthetical
 */
export function removeTrailingParenthetical(str: string): string {
    if (!str.endsWith(")")) return str

    // The parenthetical has to sit entirely on the last line, since `.` in the
    // former regex did not match line terminators.
    const lastLineBreakIndex = Math.max(
        str.lastIndexOf("\n"),
        str.lastIndexOf("\r"),
        str.lastIndexOf("\u2028"),
        str.lastIndexOf("\u2029")
    )
    const openParenIndex = str.indexOf("(", lastLineBreakIndex + 1)
    if (openParenIndex === -1) return str

    // `trimEnd` strips exactly the characters that `\s` matches
    return str.slice(0, openParenIndex).trimEnd()
}

export function isElementHidden(element: Element | null): boolean {
if (!element) return false
const computedStyle = window.getComputedStyle(element)
Expand Down
1 change: 1 addition & 0 deletions packages/@ourworldindata/utils/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ export {
checkIsDataInsight,
checkIsAuthor,
cartesian,
removeTrailingParenthetical,
isElementHidden,
} from "./Util.js"

Expand Down
4 changes: 4 additions & 0 deletions settings/clientSettings.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ export const BAKED_GRAPHER_EXPORTS_BASE_URL: string =
export const BAKED_SITE_EXPORTS_BASE_URL: string =
process.env.BAKED_SITE_EXPORTS_BASE_URL ?? `${BAKED_BASE_URL}/exports`

// Base URL for grapher thumbnails, read from the GRAPHER_DYNAMIC_THUMBNAIL_URL
// env var. Falls back to `${BAKED_GRAPHER_URL}/thumbnail` only when the env var
// is not set at all (`??`); a value that is set but empty is used as-is.
export const GRAPHER_DYNAMIC_THUMBNAIL_URL: string =
process.env.GRAPHER_DYNAMIC_THUMBNAIL_URL ??
`${BAKED_GRAPHER_URL}/thumbnail`

export const ADMIN_BASE_URL: string =
process.env.ADMIN_BASE_URL ??
`http://${ADMIN_SERVER_HOST}:${ADMIN_SERVER_PORT}`
Expand Down
18 changes: 18 additions & 0 deletions site/search/Search.scss
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,24 @@ $reset-button-margin: 16px;
}
}

// List of entity names shown on a chart hit in the search results.
// Rendered as a single run of inline, comma-separated text instead of a bulleted list.
.search-results__chart-hit-entities {
list-style: none;
font-size: 0.8em;

li {
// Keep all items on one text line
display: inline;
color: $blue-50;

// Separate items with a comma...
&::after {
content: ", ";
}

// ...but don't leave a trailing comma after the last item
&:last-child::after {
content: "";
}
}
}

/*
* Tabs / Filtering
**/
Expand Down
34 changes: 31 additions & 3 deletions site/search/SearchPanel.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@ import {
ALGOLIA_ID,
ALGOLIA_SEARCH_KEY,
BAKED_BASE_URL,
BAKED_GRAPHER_EXPORTS_BASE_URL,
BAKED_GRAPHER_URL,
GRAPHER_DYNAMIC_THUMBNAIL_URL,
} from "../../settings/clientSettings.js"
import { action, observable } from "mobx"
import { observer } from "mobx-react"
Expand Down Expand Up @@ -68,7 +70,10 @@ import {
} from "@ourworldindata/grapher"
import type { SearchResults as AlgoliaSearchResultsType } from "algoliasearch-helper"
import { SiteAnalytics } from "../SiteAnalytics.js"
import { extractRegionNamesFromSearchQuery } from "./SearchUtils.js"
import {
extractRegionNamesFromSearchQuery,
pickEntitiesForChartHit,
} from "./SearchUtils.js"

const siteAnalytics = new SiteAnalytics()

Expand Down Expand Up @@ -119,9 +124,24 @@ function ChartHit({ hit }: { hit: IChartHit }) {
const [imgLoaded, setImgLoaded] = useState(false)
const [imgError, setImgError] = useState(false)

const entities = useMemo(
() => pickEntitiesForChartHit(hit),
// eslint-disable-next-line react-hooks/exhaustive-deps
[hit._highlightResult?.availableEntities]
)
const queryStr = useMemo(() => getEntityQueryStr(entities), [entities])
const previewUrl = queryStr
? `${GRAPHER_DYNAMIC_THUMBNAIL_URL}/${hit.slug}${queryStr}`
: `${BAKED_GRAPHER_EXPORTS_BASE_URL}/${hit.slug}.svg`

useEffect(() => {
setImgLoaded(false)
setImgError(false)
}, [previewUrl])

return (
<a
href={`${BAKED_GRAPHER_URL}/${hit.slug}`}
href={`${BAKED_GRAPHER_URL}/${hit.slug}${queryStr}`}
data-algolia-index={getIndexName(SearchIndexName.Charts)}
data-algolia-object-id={hit.objectID}
data-algolia-position={hit.__position}
Expand All @@ -134,11 +154,12 @@ function ChartHit({ hit }: { hit: IChartHit }) {
</div>
)}
<img
key={previewUrl}
className={cx({ loaded: imgLoaded, error: imgError })}
loading="lazy"
width={DEFAULT_GRAPHER_WIDTH}
height={DEFAULT_GRAPHER_HEIGHT}
src={`${BAKED_GRAPHER_URL}/exports/${hit.slug}.svg`}
src={previewUrl}
onLoad={() => setImgLoaded(true)}
onError={() => setImgError(true)}
/>
Expand All @@ -154,6 +175,13 @@ function ChartHit({ hit }: { hit: IChartHit }) {
{hit.variantName}
</span>
</div>
{entities.length > 0 && (
<ul className="search-results__chart-hit-entities">
{entities.map((entity) => (
<li key={entity}>{entity}</li>
))}
</ul>
)}
</a>
)
}
Expand Down
55 changes: 55 additions & 0 deletions site/search/SearchUtils.tsx
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
import { HitAttributeHighlightResult } from "instantsearch.js"
import { IChartHit } from "./searchTypes.js"
import { EntityName } from "@ourworldindata/types"
import {
Region,
getRegionByNameOrVariantName,
regions,
countries,
escapeRegExp,
removeTrailingParenthetical,
} from "@ourworldindata/utils"

const allCountryNamesAndVariants = regions.flatMap((c) => [
Expand All @@ -22,3 +27,53 @@ export const extractRegionNamesFromSearchQuery = (query: string) => {
if (regionNames.length === 0) return null
return regionNames.map(getRegionByNameOrVariantName) as Region[]
}

// Strips the opening and closing <mark> / <strong> tags out of a highlighted
// string, leaving the text they enclose (and any other markup) untouched.
const removeHighlightTags = (text: string) => {
    const highlightTagPattern = /<\/?(mark|strong)>/g
    return text.replace(highlightTagPattern, "")
}

/**
 * Picks those of a chart hit's available entities that the search query refers to,
 * based on Algolia's highlight results for the `availableEntities` attribute.
 *
 * An entity is picked if Algolia reports a match for it and either
 * - the matched words, joined by spaces, start with the entity's name (ignoring
 *   case, a trailing parenthetical like " (UN)", and treating "-" as a space), or
 * - the entity is a country and the matched words contain its short name or one
 *   of its variant names (ignoring case).
 *
 * @param hit - the chart hit, as returned by Algolia
 * @returns the picked entity names with highlight tags removed, sorted; an empty
 *   array if the hit has no highlight results for `availableEntities`
 */
export function pickEntitiesForChartHit(hit: IChartHit): EntityName[] {
    const availableEntitiesHighlighted = hit._highlightResult
        ?.availableEntities as HitAttributeHighlightResult[] | undefined
    if (!availableEntitiesHighlighted) return []

    return availableEntitiesHighlighted
        .filter((highlightEntry) => {
            if (highlightEntry.matchLevel === "none") return false

            // Remove any trailing parentheses, e.g. "Africa (UN)" -> "Africa"
            const entityNameWithoutTrailingParens = removeTrailingParenthetical(
                removeHighlightTags(highlightEntry.value)
            )

            // The sequence of words that Algolia matched; could be something like ["arab", "united", "republic"]
            // which we want to check against the entity name
            const matchedSequenceLowerCase = highlightEntry.matchedWords
                .join(" ")
                .toLowerCase()

            // Pick entity if the matched sequence starts with the full entity name
            const entityNameNormalized = entityNameWithoutTrailingParens
                .replaceAll("-", " ") // makes "high-income countries" into "high income countries", enabling a match
                .toLowerCase()
            if (matchedSequenceLowerCase.startsWith(entityNameNormalized))
                return true

            const country = countries.find(
                (c) => c.name === entityNameWithoutTrailingParens
            )

            // Both the short name and the variant names are sent to Algolia as
            // synonyms of the country name, so a match on any of them counts.
            // Empty names are dropped, since `includes("")` is always true.
            const alternativeNames = [
                country?.shortName,
                ...(country?.variantNames ?? []),
            ].filter((name): name is string => Boolean(name))

            // Pick entity if the matched sequence contains any of the alternative names
            return alternativeNames.some((name) =>
                matchedSequenceLowerCase.includes(name.toLowerCase())
            )
        })
        .map((highlightEntry) => removeHighlightTags(highlightEntry.value))
        .sort()
}