Skip to content

Commit

Permalink
Merge branch 'algolia-geographic-entities' into dev-marcel-algolia
Browse files Browse the repository at this point in the history
  • Loading branch information
marcelgerber committed Apr 15, 2024
2 parents 719eaab + d9e4bba commit 281c114
Show file tree
Hide file tree
Showing 7 changed files with 160 additions and 13 deletions.
23 changes: 16 additions & 7 deletions baker/algolia/configureAlgolia.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import {
ALGOLIA_INDEXING,
ALGOLIA_SECRET_KEY,
} from "../../settings/serverSettings.js"
import { countries, regions } from "@ourworldindata/utils"
import { countries, regions, excludeUndefined } from "@ourworldindata/utils"
import { SearchIndexName } from "../../site/search/searchTypes.js"
import { getIndexName } from "../../site/search/searchClient.js"

Expand Down Expand Up @@ -296,12 +296,6 @@ export const configureAlgolia = async () => {
["solar", "photovoltaic", "photovoltaics", "pv"],
]

// Send all our country variant names to algolia as synonyms
for (const country of countries) {
if (country.variantNames)
synonyms.push([country.name].concat(country.variantNames))
}

const algoliaSynonyms = synonyms.map((s) => {
return {
objectID: s.join("-"),
Expand All @@ -310,6 +304,21 @@ export const configureAlgolia = async () => {
} as Synonym
})

// Send all our country variant names to algolia as one-way synonyms
for (const country of countries) {
const alternatives = excludeUndefined([
country.shortName,
...(country.variantNames ?? []),
])
for (const alternative of alternatives)
algoliaSynonyms.push({
objectID: `${alternative}->${country.name}`,
type: "oneWaySynonym",
input: alternative,
synonyms: [country.name],
})
}

await pagesIndex.saveSynonyms(algoliaSynonyms, {
replaceExistingSynonyms: true,
})
Expand Down
35 changes: 34 additions & 1 deletion baker/algolia/indexChartsToAlgolia.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ import {
OwidGdocLinkType,
excludeNullish,
isNil,
countries,
orderBy,
removeTrailingParenthetical,
} from "@ourworldindata/utils"
import { MarkdownTextWrap } from "@ourworldindata/components"
import { getAnalyticsPageviewsByUrlObj } from "../../db/model/Pageview.js"
Expand All @@ -20,6 +23,35 @@ const computeScore = (record: Omit<ChartRecord, "score">): number => {
return numRelatedArticles * 500 + views_7d
}

// Names of every country that has at least one alternative name, i.e. a
// non-empty `variantNames` list or a `shortName`.
const countriesWithVariantNames = new Set<string>()
for (const country of countries) {
    const hasAlternativeName =
        Boolean(country.variantNames?.length) || Boolean(country.shortName)
    if (hasAlternativeName) countriesWithVariantNames.add(country.name)
}

/**
 * Orders a chart's entity names for indexing: entities whose name (ignoring a
 * trailing parenthetical) is in `countriesWithVariantNames` come first, and
 * each of the two groups is sorted by name in ascending order.
 *
 * Returns an empty array when no entities are given.
 */
const processAvailableEntities = (availableEntities: string[] | null) => {
    if (!availableEntities) return []

    // Why the reordering: Algolia behaves oddly with synonyms. Given a synonym
    // "USA" -> "United States" and a search for "USA", it only seems to find
    // the match inside `availableEntities` when the entry is among the first
    // 100-or-so elements of the array. Putting countries that have variant
    // names at the top is the easy way around that.
    // - @marcelgerber, 2024-03-25
    const hasVariantNames = (entityName: string) =>
        countriesWithVariantNames.has(removeTrailingParenthetical(entityName))
    const byName = (entityName: string) => entityName

    return orderBy(
        availableEntities,
        [hasVariantNames, byName],
        ["desc", "asc"]
    )
}

const getChartsRecords = async (
knex: db.KnexReadonlyTransaction
): Promise<ChartRecord[]> => {
Expand Down Expand Up @@ -81,14 +113,15 @@ const getChartsRecords = async (
if (c.entityNames.length < 12000)
c.entityNames = excludeNullish(
JSON.parse(c.entityNames as string) as (string | null)[]
)
) as string[]
else {
console.info(
`Chart ${c.id} has too many entities, skipping its entities`
)
c.entityNames = []
}
}
c.entityNames = processAvailableEntities(c.entityNames)

c.tags = JSON.parse(c.tags)
c.keyChartForTags = JSON.parse(c.keyChartForTags as string).filter(
Expand Down
6 changes: 6 additions & 0 deletions packages/@ourworldindata/utils/src/Util.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1856,6 +1856,12 @@ export function cartesian<T>(matrix: T[][]): T[][] {
)
}

/**
 * Removes parenthetical content from _the end_ of a string, together with the
 * whitespace directly in front of it.
 * E.g. "Africa (UN)" -> "Africa"
 *
 * Produces the same result as `str.replace(/\s*\(.*\)$/, "")`, but runs in
 * linear time. The regex version was flagged by CodeQL ("Polynomial regular
 * expression used on uncontrolled data") because it backtracks heavily on
 * inputs with many repetitions of " " or "(".
 *
 * @param str - the string to strip
 * @returns `str` without its trailing parenthetical, or `str` unchanged if it
 * does not end in one
 */
export function removeTrailingParenthetical(str: string): string {
    const lastIndex = str.length - 1
    if (lastIndex < 1 || str.charAt(lastIndex) !== ")") return str

    // `.` in the original pattern never matches a line terminator, so the
    // opening parenthesis has to come after the last line terminator that
    // precedes the closing one.
    let searchFrom = 0
    for (let i = lastIndex - 1; i >= 0; i--) {
        const ch = str.charAt(i)
        if (ch === "\n" || ch === "\r" || ch === "\u2028" || ch === "\u2029") {
            searchFrom = i + 1
            break
        }
    }

    // The regex takes the leftmost possible match, i.e. the first "(" in range.
    const openIndex = str.indexOf("(", searchFrom)
    if (openIndex === -1) return str

    // Also drop the run of whitespace (`\s*`) directly before the "(".
    const whitespace = /\s/
    let end = openIndex
    while (end > 0 && whitespace.test(str.charAt(end - 1))) end--

    return str.slice(0, end)
}

export function isElementHidden(element: Element | null): boolean {
if (!element) return false
const computedStyle = window.getComputedStyle(element)
Expand Down
1 change: 1 addition & 0 deletions packages/@ourworldindata/utils/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ export {
checkIsDataInsight,
checkIsAuthor,
cartesian,
removeTrailingParenthetical,
isElementHidden,
} from "./Util.js"

Expand Down
19 changes: 19 additions & 0 deletions site/search/Search.scss
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,25 @@ $reset-button-margin: 16px;
}
}

// List of entities shown for a chart hit, rendered as inline, comma-separated text.
.search-results__chart-hit-entities {
// NOTE(review): `gap` only applies to flex, grid and multi-column containers;
// confirm a matching `display` is set elsewhere, otherwise this has no effect.
gap: 3px;
list-style: none;
font-size: 0.8em;

li {
display: inline;
color: $blue-50;

// Append a separator after every item ...
&::after {
content: ", ";
}

// ... except after the last one.
&:last-child::after {
content: "";
}
}
}

/*
* Tabs / Filtering
**/
Expand Down
36 changes: 31 additions & 5 deletions site/search/SearchPanel.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@ import {
get,
mapValues,
isElementHidden,
EntityName,
Url,
sortBy,
groupBy,
uniqBy,
EntityName,
Url,
Region,
} from "@ourworldindata/utils"
import {
Expand Down Expand Up @@ -66,9 +66,12 @@ import {
DEFAULT_GRAPHER_WIDTH,
setSelectedEntityNamesParam,
} from "@ourworldindata/grapher"
import {
pickEntitiesForChartHit,
extractRegionNamesFromSearchQuery,
} from "./SearchUtils.js"
import type { SearchResults as AlgoliaSearchResultsType } from "algoliasearch-helper"
import { SiteAnalytics } from "../SiteAnalytics.js"
import { extractRegionNamesFromSearchQuery } from "./SearchUtils.js"

const siteAnalytics = new SiteAnalytics()

Expand Down Expand Up @@ -119,9 +122,24 @@ function ChartHit({ hit }: { hit: IChartHit }) {
const [imgLoaded, setImgLoaded] = useState(false)
const [imgError, setImgError] = useState(false)

const entities = useMemo(
() => pickEntitiesForChartHit(hit),
// eslint-disable-next-line react-hooks/exhaustive-deps
[hit._highlightResult?.availableEntities]
)
const queryStr = useMemo(() => getEntityQueryStr(entities), [entities])
const previewUrl = queryStr
? `/grapher/thumbnail/${hit.slug}${queryStr}` // TODO extract to .env
: `${BAKED_GRAPHER_URL}/exports/${hit.slug}.svg`

useEffect(() => {
setImgLoaded(false)
setImgError(false)
}, [previewUrl])

return (
<a
href={`${BAKED_GRAPHER_URL}/${hit.slug}`}
href={`${BAKED_GRAPHER_URL}/${hit.slug}${queryStr}`}
data-algolia-index={getIndexName(SearchIndexName.Charts)}
data-algolia-object-id={hit.objectID}
data-algolia-position={hit.__position}
Expand All @@ -134,11 +152,12 @@ function ChartHit({ hit }: { hit: IChartHit }) {
</div>
)}
<img
key={previewUrl}
className={cx({ loaded: imgLoaded, error: imgError })}
loading="lazy"
width={DEFAULT_GRAPHER_WIDTH}
height={DEFAULT_GRAPHER_HEIGHT}
src={`${BAKED_GRAPHER_URL}/exports/${hit.slug}.svg`}
src={previewUrl}
onLoad={() => setImgLoaded(true)}
onError={() => setImgError(true)}
/>
Expand All @@ -153,6 +172,13 @@ function ChartHit({ hit }: { hit: IChartHit }) {
<span className="search-results__chart-hit-variant">
{hit.variantName}
</span>
{entities.length > 0 && (
<ul className="search-results__chart-hit-entities">
{entities.map((entity) => (
<li key={entity}>{entity}</li>
))}
</ul>
)}
</div>
</a>
)
Expand Down
53 changes: 53 additions & 0 deletions site/search/SearchUtils.tsx
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
import { HitAttributeHighlightResult } from "instantsearch.js"
import { IChartHit } from "./searchTypes.js"
import { EntityName } from "@ourworldindata/types"
import {
Region,
getRegionByNameOrVariantName,
regions,
countries,
escapeRegExp,
removeTrailingParenthetical,
} from "@ourworldindata/utils"

const allCountryNamesAndVariants = regions.flatMap((c) => [
Expand All @@ -22,3 +27,51 @@ export const extractRegionNamesFromSearchQuery = (query: string) => {
if (regionNames.length === 0) return null
return regionNames.map(getRegionByNameOrVariantName) as Region[]
}

/**
 * Strips opening and closing `<mark>` and `<strong>` tags from a string,
 * leaving the text between them untouched.
 */
const removeHighlightTags = (text: string): string => {
    const highlightTag = /<\/?(mark|strong)>/g
    return text.replace(highlightTag, "")
}

/**
 * Picks the entities of a chart hit that the search query matched in full.
 *
 * An entry of `hit._highlightResult.availableEntities` is picked when its match
 * level is not "none" and the words matched by Algolia contain either
 * - the entity name itself (ignoring a trailing parenthetical such as "(UN)"), or
 * - one of the alternative names (`variantNames` or `shortName`) of the country
 *   with that name.
 *
 * `shortName` is checked in addition to `variantNames` because both are sent to
 * Algolia as one-way synonyms; otherwise a hit found through a country's short
 * name would never have that country picked.
 *
 * @param hit - the Algolia chart hit
 * @returns the picked entity names with highlight tags removed, sorted; empty
 * if the hit has no highlight result for `availableEntities`
 */
export function pickEntitiesForChartHit(hit: IChartHit): EntityName[] {
    const availableEntitiesHighlighted = hit._highlightResult
        ?.availableEntities as HitAttributeHighlightResult[] | undefined
    if (!availableEntitiesHighlighted) return []

    return availableEntitiesHighlighted
        .filter((highlightEntry) => {
            if (highlightEntry.matchLevel === "none") return false

            // Remove any trailing parentheses, e.g. "Africa (UN)" -> "Africa"
            const entityNameWithoutTrailingParens = removeTrailingParenthetical(
                removeHighlightTags(highlightEntry.value)
            )

            // The sequence of words that Algolia matched; could be something
            // like ["arab", "united", "republic"], which we want to check
            // against the entity name
            const matchedSequenceLowerCase = highlightEntry.matchedWords
                .join(" ")
                .toLowerCase()

            // Pick entity if the matched sequence contains the full entity name
            if (
                matchedSequenceLowerCase.includes(
                    entityNameWithoutTrailingParens.toLowerCase()
                )
            )
                return true

            const country = countries.find(
                (c) => c.name === entityNameWithoutTrailingParens
            )
            if (!country) return false

            // Pick entity if the matched sequence contains any of the
            // country's alternative names (variant names or short name)
            const alternativeNames = [...(country.variantNames ?? [])]
            if (country.shortName) alternativeNames.push(country.shortName)

            return alternativeNames.some((alternative) =>
                matchedSequenceLowerCase.includes(alternative.toLowerCase())
            )
        })
        .map((highlightEntry) => removeHighlightTags(highlightEntry.value))
        .sort()
}

0 comments on commit 281c114

Please sign in to comment.