From e41e0a014266374e938817af6b7980249b8434d7 Mon Sep 17 00:00:00 2001 From: Ike Saunders Date: Tue, 24 Oct 2023 19:38:36 +0000 Subject: [PATCH 1/3] =?UTF-8?q?=F0=9F=90=9B=20filter=20non-editorial=20con?= =?UTF-8?q?tent=20out=20of=20sitemap?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- baker/algolia/indexToAlgolia.tsx | 12 +++--- baker/sitemap.ts | 30 +++++++-------- db/wpdb.ts | 22 +---------- .../@ourworldindata/utils/src/owidTypes.ts | 37 +++++++++++++++++++ 4 files changed, 59 insertions(+), 42 deletions(-) diff --git a/baker/algolia/indexToAlgolia.tsx b/baker/algolia/indexToAlgolia.tsx index 1ea7f64fe90..35dd78c5622 100644 --- a/baker/algolia/indexToAlgolia.tsx +++ b/baker/algolia/indexToAlgolia.tsx @@ -11,6 +11,7 @@ import { OwidGdocType, type RawPageview, Tag, + PostRestApi, } from "@ourworldindata/utils" import { formatPost } from "../formatWordpressPost.js" import ReactDOMServer from "react-dom/server.js" @@ -71,7 +72,7 @@ function generateChunksFromHtmlText(htmlString: string) { } async function generateWordpressRecords( - postsApi: wpdb.PostAPI[], + postsApi: PostRestApi[], pageviews: Record ): Promise { const getPostTypeAndImportance = ( @@ -187,11 +188,10 @@ const getPagesRecords = async () => { const pageviews = await Pageview.getViewsByUrlObj() const gdocs = await Gdoc.getPublishedGdocs() const publishedGdocsBySlug = keyBy(gdocs, "slug") - const postsApi = await wpdb - .getPosts() - .then((posts) => - posts.filter((post) => !publishedGdocsBySlug[`/${post.slug}`]) - ) + const postsApi = await wpdb.getPosts( + undefined, + (post) => !publishedGdocsBySlug[`/${post.slug}`] + ) const countryRecords = generateCountryRecords(countries, pageviews) const wordpressRecords = await generateWordpressRecords(postsApi, pageviews) diff --git a/baker/sitemap.ts b/baker/sitemap.ts index e37e6bdbe3b..2a285fe06fe 100644 --- a/baker/sitemap.ts +++ b/baker/sitemap.ts @@ -6,6 +6,7 @@ import { } from 
"../settings/serverSettings.js" import { dayjs, countries, queryParamsToStr } from "@ourworldindata/utils" import * as db from "../db/db.js" +import * as wpdb from "../db/wpdb.js" import urljoin from "url-join" import { countryProfileSpecs } from "../site/countryProfileProjects.js" import { ExplorerAdminServer } from "../explorerAdminServer/ExplorerAdminServer.js" @@ -57,20 +58,17 @@ const explorerToSitemapUrl = (program: ExplorerProgram): SitemapUrl[] => { } export const makeSitemap = async (explorerAdminServer: ExplorerAdminServer) => { - const posts = (await db - .knexTable("posts_with_gdoc_publish_status") - .where({ status: "publish", isGdocPublished: false }) - .select("slug", "updated_at_in_wordpress")) as { - slug: string - updated_at_in_wordpress: Date - }[] - const gdocPosts = (await db - .knexTable(Gdoc.table) - .where({ published: true }) - .select("slug", "updatedAt")) as { - slug: string - updatedAt: Date - }[] + const alreadyPublishedViaGdocsSlugs = await db.knexRaw(`-- sql + select slug from posts_with_gdoc_publish_status + where isGdocPublished = TRUE`) + const alreadyPublishedViaGdocsSlugsSet = new Set( + alreadyPublishedViaGdocsSlugs.map((row: any) => row.slug) + ) + const postsApi = await wpdb.getPosts( + undefined, + (postrow) => !alreadyPublishedViaGdocsSlugsSet.has(postrow.slug) + ) + const gdocPosts = await Gdoc.getPublishedGdocs() const charts = (await db .knexTable(Chart.table) .select(db.knexRaw(`updatedAt, config->>"$.slug" AS slug`)) @@ -94,9 +92,9 @@ export const makeSitemap = async (explorerAdminServer: ExplorerAdminServer) => { }) ) .concat( - posts.map((p) => ({ + postsApi.map((p) => ({ loc: urljoin(BAKED_BASE_URL, p.slug), - lastmod: dayjs(p.updated_at_in_wordpress).format("YYYY-MM-DD"), + lastmod: dayjs(p.modified_gmt).format("YYYY-MM-DD"), })) ) .concat( diff --git a/db/wpdb.ts b/db/wpdb.ts index edd2ea53b04..fe8cc90f812 100644 --- a/db/wpdb.ts +++ b/db/wpdb.ts @@ -476,7 +476,7 @@ export const getPosts = async ( postTypes: 
string[] = [WP_PostType.Post, WP_PostType.Page], filterFunc?: FilterFnPostRestApi, limit?: number -): Promise => { +): Promise => { if (!isWordpressAPIEnabled) return [] const perPage = 20 @@ -720,27 +720,9 @@ export const getBlockContent = async ( return post.data?.wpBlock?.content ?? undefined } -export interface PostAPI { - id: number - type: WP_PostType - slug: string - title: { - rendered: string - } - date_gmt: string - modified_gmt: string - authors_name?: string[] - content: { rendered: string } - excerpt: { rendered: string } - featured_media_paths: { - medium_large: string - thumbnail: string - } - featured_media: number -} export const getFullPost = async ( - postApi: PostAPI, + postApi: PostRestApi, excludeContent?: boolean ): Promise => ({ id: postApi.id, diff --git a/packages/@ourworldindata/utils/src/owidTypes.ts b/packages/@ourworldindata/utils/src/owidTypes.ts index 6e4889d13fb..707341022cc 100644 --- a/packages/@ourworldindata/utils/src/owidTypes.ts +++ b/packages/@ourworldindata/utils/src/owidTypes.ts @@ -324,6 +324,43 @@ export interface PostRestApi { latest?: boolean } } + id: number + date: string + date_gmt: string + guid: { + rendered: string + } + modified: string + modified_gmt: string + + status: string + type: WP_PostType + link: string + title: { + rendered: string + } + content: { + rendered: string + protected: boolean + } + excerpt: { + rendered: string + protected: boolean + } + author: number + featured_media: number + comment_status: string + ping_status: string + sticky: boolean + template: string + format: string + categories: number[] + tags: any[] + authors_name: string[] + featured_media_paths: { + thumbnail: string + medium_large: string + } } export interface KeyInsight { From 5a16038bfdf2ba831a6e1d24567dd3221062d30f Mon Sep 17 00:00:00 2001 From: Ike Saunders Date: Thu, 26 Oct 2023 15:47:01 +0000 Subject: [PATCH 2/3] =?UTF-8?q?=F0=9F=90=9B=20fix=20baking-with-gdocs-succ?= =?UTF-8?q?essor=20filtering=20logic?= MIME-Version: 
1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- baker/SiteBaker.tsx | 12 ++---------- baker/sitemap.ts | 8 ++------ db/db.ts | 18 ++++++++++++++++++ 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/baker/SiteBaker.tsx b/baker/SiteBaker.tsx index 9fb72acd358..5dcf3c4bd64 100644 --- a/baker/SiteBaker.tsx +++ b/baker/SiteBaker.tsx @@ -288,16 +288,8 @@ export class SiteBaker { private async bakePosts() { if (!this.bakeSteps.has("wordpressPosts")) return - // In the backporting workflow, the users create gdoc posts for posts. As long as these are not yet published, - // we still want to bake them from the WP posts. Once the users presses publish there though, we want to stop - // baking them from the wordpress post. Here we fetch all the slugs of posts that have been published via gdocs - // and exclude them from the baking process. - const alreadyPublishedViaGdocsSlugs = await db.knexRaw(`-- sql - select slug from posts_with_gdoc_publish_status - where isGdocPublished = TRUE`) - const alreadyPublishedViaGdocsSlugsSet = new Set( - alreadyPublishedViaGdocsSlugs.map((row: any) => row.slug) - ) + const alreadyPublishedViaGdocsSlugsSet = + await db.getSlugsWithPublishedGdocsSuccessors() const postsApi = await wpdb.getPosts( undefined, diff --git a/baker/sitemap.ts b/baker/sitemap.ts index 2a285fe06fe..578cc7095b1 100644 --- a/baker/sitemap.ts +++ b/baker/sitemap.ts @@ -58,12 +58,8 @@ const explorerToSitemapUrl = (program: ExplorerProgram): SitemapUrl[] => { } export const makeSitemap = async (explorerAdminServer: ExplorerAdminServer) => { - const alreadyPublishedViaGdocsSlugs = await db.knexRaw(`-- sql - select slug from posts_with_gdoc_publish_status - where isGdocPublished = TRUE`) - const alreadyPublishedViaGdocsSlugsSet = new Set( - alreadyPublishedViaGdocsSlugs.map((row: any) => row.slug) - ) + const alreadyPublishedViaGdocsSlugsSet = + await db.getSlugsWithPublishedGdocsSuccessors() const postsApi = await wpdb.getPosts( 
undefined, (postrow) => !alreadyPublishedViaGdocsSlugsSet.has(postrow.slug) diff --git a/db/db.ts b/db/db.ts index a301bb2fc58..fb04c286fdd 100644 --- a/db/db.ts +++ b/db/db.ts @@ -109,3 +109,21 @@ export const knexTable = (table: string): Knex.QueryBuilder => knexInstance().table(table) export const knexRaw = (str: string): Knex.Raw => knexInstance().raw(str) + +/** + * In the backporting workflow, the users create gdoc posts for posts. As long as these are not yet published, + * we still want to bake them from the WP posts. Once the user presses publish there though, we want to stop + * baking them from the wordpress post. This function fetches all the slugs of posts that have been published via gdocs, + * to help us exclude them from the baking process. + */ +export const getSlugsWithPublishedGdocsSuccessors = async (): Promise< + Set +> => { + return knexRaw( + `-- sql + select slug from posts_with_gdoc_publish_status + where isGdocPublished = TRUE` + ) + .then((res) => res[0]) + .then((rows) => new Set(rows.map((row: any) => row.slug))) +} From 32ba1a8acabab0562534da97bd7f6e60af9cb859 Mon Sep 17 00:00:00 2001 From: Ike Saunders Date: Mon, 30 Oct 2023 14:41:47 -0400 Subject: [PATCH 3/3] =?UTF-8?q?=F0=9F=90=9B=20fix=20wp-posts-with-successo?= =?UTF-8?q?rs=20getting=20indexed=20in=20algolia?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- baker/algolia/indexToAlgolia.tsx | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/baker/algolia/indexToAlgolia.tsx b/baker/algolia/indexToAlgolia.tsx index 35dd78c5622..472117ba0a4 100644 --- a/baker/algolia/indexToAlgolia.tsx +++ b/baker/algolia/indexToAlgolia.tsx @@ -188,10 +188,18 @@ const getPagesRecords = async () => { const pageviews = await Pageview.getViewsByUrlObj() const gdocs = await Gdoc.getPublishedGdocs() const publishedGdocsBySlug = keyBy(gdocs, "slug") - const postsApi = await wpdb.getPosts( - undefined, - (post) => 
!publishedGdocsBySlug[`/${post.slug}`] - ) + const slugsWithPublishedGdocsSuccessors = + await db.getSlugsWithPublishedGdocsSuccessors() + const postsApi = await wpdb.getPosts(undefined, (post) => { + // Two things can happen here: + // 1. There's a published Gdoc with the same slug + // 2. This post has a Gdoc successor (which might have a different slug) + // In either case, we don't want to index this WP post + return !( + publishedGdocsBySlug[post.slug] || + slugsWithPublishedGdocsSuccessors.has(post.slug) + ) + }) const countryRecords = generateCountryRecords(countries, pageviews) const wordpressRecords = await generateWordpressRecords(postsApi, pageviews)