From ab8251bf016204eb12451f42de5da80b66e64321 Mon Sep 17 00:00:00 2001 From: Ike Saunders Date: Mon, 16 Dec 2024 10:03:17 -0500 Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=89=20automatic=20breadcrumbs=20from?= =?UTF-8?q?=20the=20tag=20graph?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- adminSiteServer/app.test.tsx | 282 +++++++++++++++++- baker/SiteBaker.tsx | 13 + baker/algolia/utils/charts.ts | 12 +- baker/algolia/utils/explorerViews.ts | 15 +- db/db.ts | 89 ++++-- db/model/Gdoc/GdocFactory.ts | 20 ++ db/tests/testHelpers.ts | 4 + .../types/src/dbTypes/TagsGraph.ts | 8 - .../types/src/domainTypes/ContentGraph.ts | 2 + packages/@ourworldindata/types/src/index.ts | 1 + packages/@ourworldindata/utils/src/Util.ts | 14 + site/Breadcrumb/Breadcrumb.tsx | 15 +- 12 files changed, 425 insertions(+), 50 deletions(-) delete mode 100644 packages/@ourworldindata/types/src/dbTypes/TagsGraph.ts diff --git a/adminSiteServer/app.test.tsx b/adminSiteServer/app.test.tsx index 120673d8706..e4e3ea418ca 100644 --- a/adminSiteServer/app.test.tsx +++ b/adminSiteServer/app.test.tsx @@ -1,5 +1,5 @@ import { google } from "googleapis" -import { jest } from "@jest/globals" +import { beforeAll, jest } from "@jest/globals" // Mock the google docs api to retrieve files from the test-files directory // AFAICT, we have to do this directly after the import // and before any other code that might import googleapis @@ -43,7 +43,10 @@ import knex, { Knex } from "knex" import { dbTestConfig } from "../db/tests/dbTestConfig.js" import { TransactionCloseMode, + getBestBreadcrumbs, + getParentTagArraysByChildName, knexReadWriteTransaction, + knexReadonlyTransaction, setKnexInstance, } from "../db/db.js" import { cleanTestDb, TABLES_IN_USE } from "../db/tests/testHelpers.js" @@ -51,9 +54,19 @@ import { ChartConfigsTableName, ChartsTableName, DatasetsTableName, + DbInsertTag, + DbInsertTagGraphNode, MultiDimDataPagesTableName, MultiDimXChartConfigsTableName, + TagsTableName, + TagGraphTableName, VariablesTableName, + TagGraphRootName, + PostsGdocsTableName, + OwidGdocType, + DbInsertPostGdoc, + DbInsertPostGdocXTag, + PostsGdocsXTagsTableName, } from "@ourworldindata/types" import path from "path" import fs from "fs" @@ -920,3 +933,270 @@ describe("OwidAdminApp: indicator-level chart configs", () => { expect(json.success).toBe(false) }) }) + +describe.only("OwidAdminApp: tag graph", () => { + // prettier-ignore + const dummyTags: DbInsertTag[] = [ + { name: TagGraphRootName, id: 1 }, + { name: "Energy and Environment", id: 2 }, + { name: "Energy", slug: "energy", id: 3 }, + { name: "Nuclear Energy", slug: "nuclear-energy", id: 4 }, + { name: "CO2 & Greenhouse Gas Emissions", slug: "co2-and-greenhouse-gas-emissions", id: 5 }, + ] + + const dummyTagGraph: DbInsertTagGraphNode[] = [ + { parentId: 1, childId: 2 }, + { parentId: 2, childId: 3, weight: 110 }, + { parentId: 2, childId: 5 }, + { parentId: 3, childId: 4 }, + { parentId: 5, childId: 4 }, + ] + + function makeDummyTopicPage(slug: string): DbInsertPostGdoc { + return { + slug, + content: JSON.stringify({ + type: OwidGdocType.TopicPage, + authors: [] as string[], + }), + id: slug, + published: 1, + createdAt: new Date(), + publishedAt: new Date(), + markdown: "", + } + } + const dummyTopicPages: DbInsertPostGdoc[] = [ + makeDummyTopicPage("energy"), + makeDummyTopicPage("nuclear-energy"), + makeDummyTopicPage("co2-and-greenhouse-gas-emissions"), + ] + + const dummyPostTags: DbInsertPostGdocXTag[] = [ + { gdocId: "energy", tagId: 3 }, + { gdocId: "nuclear-energy", tagId: 4 }, + { gdocId: "co2-and-greenhouse-gas-emissions", tagId: 5 }, + ] + + beforeEach(async () => { + await testKnexInstance!(TagsTableName).insert(dummyTags) + await testKnexInstance!(TagGraphTableName).insert(dummyTagGraph) + await testKnexInstance!(PostsGdocsTableName).insert(dummyTopicPages) + await testKnexInstance!(PostsGdocsXTagsTableName).insert(dummyPostTags) + }) + it("should be able to see all the tags", async () => { + const tags = await fetchJsonFromAdminApi("/tags.json") + expect(tags).toEqual({ + tags: [ + { + id: 5, + isTopic: 1, + name: "CO2 & Greenhouse Gas Emissions", + slug: "co2-and-greenhouse-gas-emissions", + }, + { + id: 3, + isTopic: 1, + name: "Energy", + slug: "energy", + }, + { + id: 2, + isTopic: 0, + name: "Energy and Environment", + slug: null, + }, + { + id: 4, + isTopic: 1, + name: "Nuclear Energy", + slug: "nuclear-energy", + }, + { + id: 1, + isTopic: 0, + name: "tag-graph-root", + slug: null, + }, + ], + }) + }) + + it("should be able to generate a tag graph", async () => { + const json = await fetchJsonFromAdminApi("/flatTagGraph.json") + expect(json).toEqual({ + "1": [ + { + childId: 2, + isTopic: 0, + name: "Energy and Environment", + parentId: 1, + weight: 100, + }, + ], + "2": [ + { + childId: 3, + isTopic: 1, + name: "Energy", + parentId: 2, + weight: 110, + }, + { + childId: 5, + isTopic: 1, + name: "CO2 & Greenhouse Gas Emissions", + parentId: 2, + weight: 100, + }, + ], + "3": [ + { + childId: 4, + isTopic: 1, + name: "Nuclear Energy", + parentId: 3, + weight: 100, + }, + ], + "5": [ + { + childId: 4, + isTopic: 1, + name: "Nuclear Energy", + parentId: 5, + weight: 100, + }, + ], + __rootId: 1, + }) + }) + + it("should be able to generate a set of breadcrumbs for a tag", async () => { + await knexReadonlyTransaction( + async (trx) => { + const parentTagArraysByChildName = + await getParentTagArraysByChildName(trx) + const breadcrumbs = getBestBreadcrumbs( + [ + { + id: 4, + name: "Nuclear Energy", + slug: "nuclear-energy", + }, + ], + parentTagArraysByChildName + ) + // breadcrumb hrefs are env-dependent, so we just assert on the labels + const labelsOnly = breadcrumbs.map((b) => b.label) + expect(labelsOnly).toEqual(["Energy", "Nuclear Energy"]) + }, + TransactionCloseMode.KeepOpen, + testKnexInstance + ) + }) + + it("should generate an optimal set of breadcrumbs when given multiple tags", async () => { + await knexReadonlyTransaction( + async (trx) => { + const parentTagArraysByChildName = + await getParentTagArraysByChildName(trx) + const breadcrumbs = getBestBreadcrumbs( + [ + { + id: 4, + name: "Nuclear Energy", + slug: "nuclear-energy", + }, + { + id: 5, + name: "CO2 & Greenhouse Gas Emissions", + slug: "co2-and-greenhouse-gas-emissions", + }, + ], + parentTagArraysByChildName + ) + // breadcrumb hrefs are env-dependent, so we just assert on the labels + const labelsOnly = breadcrumbs.map((b) => b.label) + expect(labelsOnly).toEqual(["Energy", "Nuclear Energy"]) + }, + TransactionCloseMode.KeepOpen, + testKnexInstance + ) + }) + it("should return an empty array when there are no topic tags in any of the tags' ancestors", async () => { + await knexReadonlyTransaction( + async (trx) => { + const parentTagArraysByChildName = + await getParentTagArraysByChildName(trx) + const breadcrumbs = getBestBreadcrumbs( + [ + { + id: 2, + name: "Energy and Environment", + slug: "", + }, + ], + parentTagArraysByChildName + ) + // breadcrumb hrefs are env-dependent, so we just assert on the labels + const labelsOnly = breadcrumbs.map((b) => b.label) + expect(labelsOnly).toEqual([]) + }, + TransactionCloseMode.KeepOpen, + testKnexInstance + ) + }) + it("when there are two valid paths to a given tag, it selects the longest one", async () => { + await knexReadonlyTransaction( + async (trx) => { + // Here, Women's Employment has 2 paths: + // 1. Poverty and Economic Development > Women's Employment + // 2. Human Rights > Women's Rights > Women's Employment + // prettier-ignore + await testKnexInstance!(TagsTableName).insert([ + { name: "Human Rights", id: 6 }, + { name: "Women's Rights", slug: "womens-rights", id: 7 }, + { name: "Women's Employment", slug: "womens-employment", id: 8 }, + { name: "Poverty and Economic Development", id: 9 }, + ]) + await testKnexInstance!(TagGraphTableName).insert([ + { parentId: 1, childId: 6 }, + { parentId: 6, childId: 7 }, + { parentId: 7, childId: 8 }, + { parentId: 1, childId: 9 }, + { parentId: 9, childId: 8 }, + ]) + await testKnexInstance!(PostsGdocsTableName).insert([ + makeDummyTopicPage("womens-rights"), + makeDummyTopicPage("womens-employment"), + ]) + await testKnexInstance!(PostsGdocsXTagsTableName).insert([ + { gdocId: "womens-rights", tagId: 7 }, + { gdocId: "womens-employment", tagId: 8 }, + ]) + + const parentTagArraysByChildName = + await getParentTagArraysByChildName(trx) + const breadcrumbs = getBestBreadcrumbs( + [ + { + id: 8, + name: "Women's Employment", + slug: "womens-employment", + }, + ], + parentTagArraysByChildName + ) + // breadcrumb hrefs are env-dependent, so we just assert on the labels + const labelsOnly = breadcrumbs.map((b) => b.label) + expect(labelsOnly).toEqual([ + "Women's Rights", + "Women's Employment", + ]) + }, + TransactionCloseMode.KeepOpen, + testKnexInstance + ) + }) +}) diff --git a/baker/SiteBaker.tsx b/baker/SiteBaker.tsx index c422adcaa4e..8a565eb5789 100644 --- a/baker/SiteBaker.tsx +++ b/baker/SiteBaker.tsx @@ -597,6 +597,9 @@ export class SiteBaker { .getPublishedGdocPostsWithTags(knex) .then((gdocs) => gdocs.map(gdocFromJSON)) + const allParentTagArraysByChildName = + await db.getParentTagArraysByChildName(knex) + const gdocsToBake = slugs !== undefined ? publishedGdocs.filter((gdoc) => slugs.includes(gdoc.slug)) @@ -630,6 +633,16 @@ export class SiteBaker { } publishedGdoc.linkedIndicators = attachments.linkedIndicators + if ( + !publishedGdoc.breadcrumbs?.length && + publishedGdoc.tags?.length + ) { + publishedGdoc.breadcrumbs = db.getBestBreadcrumbs( + publishedGdoc.tags, + allParentTagArraysByChildName + ) + } + // this is a no-op if the gdoc doesn't have an all-chart block if ("loadRelatedCharts" in publishedGdoc) { await publishedGdoc.loadRelatedCharts(knex) diff --git a/baker/algolia/utils/charts.ts b/baker/algolia/utils/charts.ts index 51f36eb0280..ac74e153536 100644 --- a/baker/algolia/utils/charts.ts +++ b/baker/algolia/utils/charts.ts @@ -13,6 +13,7 @@ import { isPathRedirectedToExplorer } from "../../../explorerAdminServer/Explore import { ParsedChartRecordRow, RawChartRecordRow } from "./types.js" import { excludeNullish } from "@ourworldindata/utils" import { processAvailableEntities } from "./shared.js" +import { getUniqueNamesFromParentTagArrays } from "@ourworldindata/utils/dist/Util.js" const computeChartScore = (record: Omit): number => { const { numRelatedArticles, views_7d } = record @@ -99,7 +100,8 @@ export const getChartsRecords = async ( const pageviews = await getAnalyticsPageviewsByUrlObj(knex) - const parentTagsByChildName = await db.getParentTagsByChildName(knex) + const parentTagArraysByChildName = + await db.getParentTagArraysByChildName(knex) const records: ChartRecord[] = [] for (const c of parsedRows) { @@ -121,10 +123,12 @@ export const getChartsRecords = async ( fontSize: 10, // doesn't matter, but is a mandatory field }).plaintext - const parentTags = c.tags.flatMap( + const parentTags = c.tags.flatMap((tagName) => { + const parentTagArrays = parentTagArraysByChildName[tagName] // a chart can be tagged with a tag that isn't in the tag graph - (tag) => parentTagsByChildName[tag] || [] - ) + if (!parentTagArrays) return [] + return getUniqueNamesFromParentTagArrays(parentTagArrays) + }) const record = { objectID: c.id.toString(), diff --git a/baker/algolia/utils/explorerViews.ts b/baker/algolia/utils/explorerViews.ts index 84c2037fd4f..19b059688cc 100644 --- a/baker/algolia/utils/explorerViews.ts +++ b/baker/algolia/utils/explorerViews.ts @@ -49,6 +49,7 @@ import { ChartRecord, ChartRecordType, } from "../../../site/search/searchTypes.js" +import { getUniqueNamesFromParentTagArrays } from "@ourworldindata/utils/dist/Util.js" export function explorerViewRecordToChartRecord( e: ExplorerViewFinalRecord @@ -698,7 +699,7 @@ async function getExplorersWithInheritedTags(trx: db.KnexReadonlyTransaction) { // The DB query gets the tags for the explorer, but we need to add the parent tags as well. // This isn't done in the query because it would require a recursive CTE. // It's easier to write that query once, separately, and reuse it. - const parentTags = await db.getParentTagsByChildName(trx) + const parentTagArrays = await db.getParentTagArraysByChildName(trx) const publishedExplorersWithTags = [] for (const explorer of Object.values(explorersBySlug)) { @@ -709,10 +710,14 @@ async function getExplorersWithInheritedTags(trx: db.KnexReadonlyTransaction) { }) } const tags = new Set() - for (const tag of explorer.tags) { - tags.add(tag) - for (const parentTag of parentTags[tag]) { - tags.add(parentTag) + + for (const tagName of explorer.tags) { + tags.add(tagName) + const parentTagNames = getUniqueNamesFromParentTagArrays( + parentTagArrays[tagName] + ) + for (const parentTagName of parentTagNames) { + tags.add(parentTagName) } } diff --git a/db/db.ts b/db/db.ts index ab9aa2e1ada..552a41c796c 100644 --- a/db/db.ts +++ b/db/db.ts @@ -5,6 +5,7 @@ import { GRAPHER_DB_PASS, GRAPHER_DB_NAME, GRAPHER_DB_PORT, + BAKED_BASE_URL, } from "../settings/serverSettings.js" import { registerExitHandler } from "./cleanup.js" import { createTagGraph, keyBy } from "@ourworldindata/utils" @@ -30,8 +31,10 @@ import { MinimalExplorerInfo, DbEnrichedImage, DbEnrichedImageWithUserId, + MinimalTag, + BreadcrumbItem, } from "@ourworldindata/types" -import { groupBy, uniq } from "lodash" +import { groupBy } from "lodash" import { gdocFromJSON } from "./model/Gdoc/GdocFactory.js" // Return the first match from a mysql query @@ -536,43 +539,81 @@ export async function getFlatTagGraph(knex: KnexReadonlyTransaction): Promise< return { ...tagGraphByParentId, __rootId: tagGraphRootIdResult.id } } -// DFS through the tag graph and create a map of parent tags for each child tag -// e.g. { "Child": [ "Parent", "Grandparent" ], "Parent": [ "Grandparent" ] } -// parent tags are listed in no particular order -export async function getParentTagsByChildName( +// DFS through the tag graph and track all paths from a child to the root +// e.g. { "childTag": [ [parentTag1, parentTag2], [parentTag3] ] } +// Use this with getUniqueNamesFromParentTagArrays to get Record instead +export async function getParentTagArraysByChildName( trx: KnexReadonlyTransaction -): Promise> { +): Promise> { const { __rootId, ...flatTagGraph } = await getFlatTagGraph(trx) const tagGraph = createTagGraph(flatTagGraph, __rootId) - const tagsById = await trx("tags") - .select("id", "name") + .select("id", "name", "slug") .then((tags) => keyBy(tags, "id")) - const parentTagsByChildName: Record< - DbPlainTag["name"], - DbPlainTag["name"][] - > = {} + const pathsByChildName: Record = {} + + function trackAllPaths( + node: TagGraphNode, + currentPath: DbPlainTag[] = [] + ): void { + const currentTag = tagsById[node.id] + const newPath = [...currentPath, currentTag] + + // Don't add paths for root node + if (node.id !== __rootId) { + const nodeName = currentTag.name + if (!pathsByChildName[nodeName]) { + pathsByChildName[nodeName] = [] + } + + // Add the complete path (excluding root) + pathsByChildName[nodeName].push(newPath.slice(1)) + } - function trackParents(node: TagGraphNode): void { for (const child of node.children) { - trackParents(child) + trackAllPaths(child, newPath) } + } - const preexistingParents = parentTagsByChildName[node.name] ?? [] - // node.path is an array of tag ids from the root to the current node - // slice to remove the root node and the current node, then map them into tag names - const newParents = node.path.slice(1, -1).map((id) => tagsById[id].name) + trackAllPaths(tagGraph) - parentTagsByChildName[node.name] = uniq([ - ...preexistingParents, - ...newParents, - ]) + return pathsByChildName +} + +export function getBestBreadcrumbs( + tags: MinimalTag[], + parentTagArraysByChildName: Record +): BreadcrumbItem[] { + // For each tag, find the best path according to our criteria + // e.g. { "Nuclear Energy ": ["Energy and Environment", "Energy"], "Air Pollution": ["Energy and Environment"] } + const result = new Map() + + for (const tag of tags) { + const paths = parentTagArraysByChildName[tag.name] + if (paths && paths.length > 0) { + // Since getFlatTagGraph already orders by weight DESC and name ASC, + // the first path in the array will be our best path + result.set(tag.id, paths[0]) + } } - trackParents(tagGraph) + // Only keep the topics in the paths, because only topics are clickable as breadcrumbs + const topicsOnly = Array.from(result.values()).reduce((acc, path) => { + return [...acc, path.filter((tag) => tag.slug)] + }, [] as DbPlainTag[][]) + + // Pick the longest path from result, assuming that the longest path is the best + const longestPath = topicsOnly.reduce((best, path) => { + return path.length > best.length ? path : best + }, []) + + const breadcrumbs = longestPath.map((tag) => ({ + label: tag.name, + href: `${BAKED_BASE_URL}/${tag.slug}`, + })) - return parentTagsByChildName + return breadcrumbs } export async function updateTagGraph( diff --git a/db/model/Gdoc/GdocFactory.ts b/db/model/Gdoc/GdocFactory.ts index 0a192d45dbe..4e8f9f8140a 100644 --- a/db/model/Gdoc/GdocFactory.ts +++ b/db/model/Gdoc/GdocFactory.ts @@ -44,6 +44,8 @@ import { KnexReadWriteTransaction, getImageMetadataByFilenames, getPublishedGdocPostsWithTags, + getParentTagArraysByChildName, + getBestBreadcrumbs, } from "../../db.js" import { enrichedBlocksToMarkdown } from "./enrichedToMarkdown.js" import { GdocAbout } from "./GdocAbout.js" @@ -204,7 +206,17 @@ export async function getGdocBaseObjectById( [id] ) gdoc.tags = tags + + if (!gdoc.breadcrumbs?.length && tags.length) { + const parentTagArraysByChildName = + await getParentTagArraysByChildName(knex) + gdoc.breadcrumbs = getBestBreadcrumbs( + gdoc.tags, + parentTagArraysByChildName + ) + } } + return gdoc } @@ -292,6 +304,14 @@ export async function getPublishedGdocBaseObjectBySlug( [gdoc.id] ) gdoc.tags = tags + if (!gdoc.breadcrumbs?.length && tags.length) { + const parentTagArraysByChildName = + await getParentTagArraysByChildName(knex) + gdoc.breadcrumbs = getBestBreadcrumbs( + gdoc.tags, + parentTagArraysByChildName + ) + } } return gdoc } diff --git a/db/tests/testHelpers.ts b/db/tests/testHelpers.ts index 47e3f231e55..8340b59bf75 100644 --- a/db/tests/testHelpers.ts +++ b/db/tests/testHelpers.ts @@ -7,6 +7,8 @@ import { MultiDimDataPagesTableName, MultiDimXChartConfigsTableName, PostsGdocsTableName, + TagGraphTableName, + TagsTableName, UsersTableName, VariablesTableName, } from "@ourworldindata/types" @@ -24,6 +26,8 @@ export const TABLES_IN_USE = [ DatasetsTableName, PostsGdocsTableName, UsersTableName, + TagGraphTableName, + TagsTableName, ] export async function cleanTestDb( diff --git a/packages/@ourworldindata/types/src/dbTypes/TagsGraph.ts b/packages/@ourworldindata/types/src/dbTypes/TagsGraph.ts deleted file mode 100644 index 3e9aa929bd0..00000000000 --- a/packages/@ourworldindata/types/src/dbTypes/TagsGraph.ts +++ /dev/null @@ -1,8 +0,0 @@ -/** the entity in the `tags` table */ -export const TagsGraphTableName = "tag_graph" -export interface DbInsertTagGraph { - parentId: number - childId: number - weight?: number -} -export type DbPlainTag = Required diff --git a/packages/@ourworldindata/types/src/domainTypes/ContentGraph.ts b/packages/@ourworldindata/types/src/domainTypes/ContentGraph.ts index 34cf5a8fd8c..29c3d5a3bf8 100644 --- a/packages/@ourworldindata/types/src/domainTypes/ContentGraph.ts +++ b/packages/@ourworldindata/types/src/domainTypes/ContentGraph.ts @@ -12,6 +12,8 @@ export interface CategoryWithEntries { subcategories?: CategoryWithEntries[] } +export const TagGraphTableName = "tag_graph" + export type DbInsertTagGraphNode = { parentId: number childId: number diff --git a/packages/@ourworldindata/types/src/index.ts b/packages/@ourworldindata/types/src/index.ts index f5fd8971661..568d771f0fd 100644 --- a/packages/@ourworldindata/types/src/index.ts +++ b/packages/@ourworldindata/types/src/index.ts @@ -127,6 +127,7 @@ export { export { TagGraphRootName, + TagGraphTableName, type CategoryWithEntries, type EntryMeta, type FlatTagGraph, diff --git a/packages/@ourworldindata/utils/src/Util.ts b/packages/@ourworldindata/utils/src/Util.ts index 9908fd28753..d80a8f250a5 100644 --- a/packages/@ourworldindata/utils/src/Util.ts +++ b/packages/@ourworldindata/utils/src/Util.ts @@ -177,6 +177,7 @@ import { GrapherInterface, DimensionProperty, GRAPHER_CHART_TYPES, + DbPlainTag, } from "@ourworldindata/types" import { PointVector } from "./PointVector.js" import React from "react" @@ -1926,6 +1927,19 @@ export function isFiniteWithGuard(value: unknown): value is number { return isFinite(value as any) } +// Use with getParentTagArraysByChildName to collapse all paths to the child into a single array of unique parent tag names +export function getUniqueNamesFromParentTagArrays( + parentTagArrays: DbPlainTag[][] +): string[] { + const tagNames = new Set( + parentTagArrays.flatMap((parentTagArray) => + parentTagArray.map((tag) => tag.name) + ) + ) + + return [...tagNames] +} + export function createTagGraph( tagGraphByParentId: Record, rootId: number diff --git a/site/Breadcrumb/Breadcrumb.tsx b/site/Breadcrumb/Breadcrumb.tsx index 07791a48301..1efaa58cbd8 100644 --- a/site/Breadcrumb/Breadcrumb.tsx +++ b/site/Breadcrumb/Breadcrumb.tsx @@ -70,14 +70,13 @@ export const Breadcrumbs = ({ {items.map((item, idx) => { const isLast = idx === items.length - 1 - const breadcrumb = - !isLast && item.href ? ( - - {item.label} - - ) : ( - {item.label} - ) + const breadcrumb = item.href ? ( + + {item.label} + + ) : ( + {item.label} + ) return (