From b940bbcc72d09678e9f412ef6a5a1be1352fb56d Mon Sep 17 00:00:00 2001 From: Matthieu Bergel Date: Thu, 8 Feb 2024 20:24:53 +0000 Subject: [PATCH] refactor: bake embeds from content graph --- baker/SiteBaker.tsx | 26 +++++++-------------- db/model/Chart.ts | 56 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 63 insertions(+), 19 deletions(-) diff --git a/baker/SiteBaker.tsx b/baker/SiteBaker.tsx index ffe8a633074..e98fcf8744f 100644 --- a/baker/SiteBaker.tsx +++ b/baker/SiteBaker.tsx @@ -2,7 +2,6 @@ import fs from "fs-extra" import path from "path" import { glob } from "glob" import { keyBy, without, uniq, mapValues, pick } from "lodash" -import cheerio from "cheerio" import ProgressBar from "progress" import * as wpdb from "../db/wpdb.js" import * as db from "../db/db.js" @@ -86,7 +85,10 @@ import { GdocPost } from "../db/model/Gdoc/GdocPost.js" import { Image } from "../db/model/Image.js" import { generateEmbedSnippet } from "../site/viteUtils.js" import { logErrorAndMaybeSendToBugsnag } from "../serverUtils/errorLog.js" -import { Chart } from "../db/model/Chart.js" +import { + Chart, + getChartEmbedUrlsInPublishedWordpressPosts, +} from "../db/model/Chart.js" import { BAKED_BASE_URL, BAKED_GRAPHER_EXPORTS_BASE_URL, @@ -185,23 +187,11 @@ export class SiteBaker { private async bakeEmbeds() { if (!this.bakeSteps.has("embeds")) return - // Find all grapher urls used as embeds in all posts on the site - const rows = await wpdb.singleton.query( - `SELECT post_content FROM wp_posts WHERE (post_type='page' OR post_type='post' OR post_type='wp_block') AND post_status='publish'` + + // Find all grapher urls used as embeds in all Wordpress posts on the site + const grapherUrls = uniq( + await getChartEmbedUrlsInPublishedWordpressPosts() ) - let grapherUrls = [] - for (const row of rows) { - const $ = cheerio.load(row.post_content) - grapherUrls.push( - ...$("iframe") - .toArray() - .filter((el) => - (el.attribs["src"] || "").match(/\/grapher\//) - ) - .map((el) => el.attribs["src"].trim()) - ) - } - grapherUrls = uniq(grapherUrls) await bakeGrapherUrls(grapherUrls) diff --git a/db/model/Chart.ts b/db/model/Chart.ts index 58ac3783fa3..6cd6c73e441 100644 --- a/db/model/Chart.ts +++ b/db/model/Chart.ts @@ -25,7 +25,10 @@ import { RelatedChart, } from "@ourworldindata/types" import { OpenAI } from "openai" -import { OPENAI_API_KEY } from "../../settings/serverSettings.js" +import { + BAKED_BASE_URL, + OPENAI_API_KEY, +} from "../../settings/serverSettings.js" // XXX hardcoded filtering to public parent tags export const PUBLIC_TAG_PARENT_IDS = [ @@ -379,3 +382,54 @@ export const getRelatedChartsForVariable = async ( ORDER BY title ASC `) } + +export const getChartEmbedUrlsInPublishedWordpressPosts = async (): Promise< + string[] +> => { + const chartSlugQueryString: { target: string; queryString: string }[] = ( + await db.knexInstance().raw( + ` + SELECT + pl.target, + pl.queryString + FROM + posts_links pl + JOIN posts p ON p.id = pl.sourceId + WHERE + pl.linkType = "grapher" + AND pl.componentType = "src" + AND p.status = "publish" + AND p.type != 'wp_block' + AND p.slug NOT IN ( + -- We want to exclude the slugs of published gdocs, since they override the Wordpress posts + -- published under the same slugs. + SELECT + slug from posts_gdocs pg + WHERE + pg.slug = p.slug + AND pg.content ->> '$.type' <> 'fragment' + AND pg.published = 1 + ) + -- Commenting this out since we currently don't do anything with the baked embeds in gdocs posts + -- see https://github.com/owid/owid-grapher/issues/2992#issuecomment-1934690219 + -- Rename to getChartEmbedUrlsInPublishedPosts if we decide to use this + -- UNION + -- SELECT + -- pgl.target, + -- pgl.queryString + -- FROM + -- posts_gdocs_links pgl + -- JOIN posts_gdocs pg on pg.id = pgl.sourceId + -- WHERE + -- pgl.linkType = "grapher" + -- AND pgl.componentType = "chart" + -- AND pg.content ->> '$.type' <> 'fragment' + -- AND pg.published = 1 + ` + ) + )[0] + + return chartSlugQueryString.map((row) => { + return `${BAKED_BASE_URL}/${row.target}${row.queryString}` + }) +}