Skip to content

Commit

Permalink
Merge pull request #2846 from owid/sitemap-filtering
Browse files Browse the repository at this point in the history
🐛 filter non-editorial content out of sitemap
  • Loading branch information
ikesau authored Nov 1, 2023
2 parents 6ba1911 + 32ba1a8 commit f3d12b0
Show file tree
Hide file tree
Showing 6 changed files with 82 additions and 51 deletions.
12 changes: 2 additions & 10 deletions baker/SiteBaker.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -288,16 +288,8 @@ export class SiteBaker {

private async bakePosts() {
if (!this.bakeSteps.has("wordpressPosts")) return
// In the backporting workflow, the users create gdoc posts for posts. As long as these are not yet published,
// we still want to bake them from the WP posts. Once the user presses publish there though, we want to stop
// baking them from the wordpress post. Here we fetch all the slugs of posts that have been published via gdocs
// and exclude them from the baking process.
const alreadyPublishedViaGdocsSlugs = await db.knexRaw(`-- sql
select slug from posts_with_gdoc_publish_status
where isGdocPublished = TRUE`)
const alreadyPublishedViaGdocsSlugsSet = new Set(
alreadyPublishedViaGdocsSlugs.map((row: any) => row.slug)
)
const alreadyPublishedViaGdocsSlugsSet =
await db.getSlugsWithPublishedGdocsSuccessors()

const postsApi = await wpdb.getPosts(
undefined,
Expand Down
18 changes: 13 additions & 5 deletions baker/algolia/indexToAlgolia.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import {
OwidGdocType,
type RawPageview,
Tag,
PostRestApi,
} from "@ourworldindata/utils"
import { formatPost } from "../formatWordpressPost.js"
import ReactDOMServer from "react-dom/server.js"
Expand Down Expand Up @@ -71,7 +72,7 @@ function generateChunksFromHtmlText(htmlString: string) {
}

async function generateWordpressRecords(
postsApi: wpdb.PostAPI[],
postsApi: PostRestApi[],
pageviews: Record<string, RawPageview>
): Promise<PageRecord[]> {
const getPostTypeAndImportance = (
Expand Down Expand Up @@ -187,11 +188,18 @@ const getPagesRecords = async () => {
const pageviews = await Pageview.getViewsByUrlObj()
const gdocs = await Gdoc.getPublishedGdocs()
const publishedGdocsBySlug = keyBy(gdocs, "slug")
const postsApi = await wpdb
.getPosts()
.then((posts) =>
posts.filter((post) => !publishedGdocsBySlug[`/${post.slug}`])
const slugsWithPublishedGdocsSuccessors =
await db.getSlugsWithPublishedGdocsSuccessors()
const postsApi = await wpdb.getPosts(undefined, (post) => {
// Two things can happen here:
// 1. There's a published Gdoc with the same slug
// 2. This post has a Gdoc successor (which might have a different slug)
// In either case, we don't want to index this WP post
return !(
publishedGdocsBySlug[post.slug] ||
slugsWithPublishedGdocsSuccessors.has(post.slug)
)
})

const countryRecords = generateCountryRecords(countries, pageviews)
const wordpressRecords = await generateWordpressRecords(postsApi, pageviews)
Expand Down
26 changes: 10 additions & 16 deletions baker/sitemap.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import {
} from "../settings/serverSettings.js"
import { dayjs, countries, queryParamsToStr } from "@ourworldindata/utils"
import * as db from "../db/db.js"
import * as wpdb from "../db/wpdb.js"
import urljoin from "url-join"
import { countryProfileSpecs } from "../site/countryProfileProjects.js"
import { ExplorerAdminServer } from "../explorerAdminServer/ExplorerAdminServer.js"
Expand Down Expand Up @@ -57,20 +58,13 @@ const explorerToSitemapUrl = (program: ExplorerProgram): SitemapUrl[] => {
}

export const makeSitemap = async (explorerAdminServer: ExplorerAdminServer) => {
const posts = (await db
.knexTable("posts_with_gdoc_publish_status")
.where({ status: "publish", isGdocPublished: false })
.select("slug", "updated_at_in_wordpress")) as {
slug: string
updated_at_in_wordpress: Date
}[]
const gdocPosts = (await db
.knexTable(Gdoc.table)
.where({ published: true })
.select("slug", "updatedAt")) as {
slug: string
updatedAt: Date
}[]
const alreadyPublishedViaGdocsSlugsSet =
await db.getSlugsWithPublishedGdocsSuccessors()
const postsApi = await wpdb.getPosts(
undefined,
(postrow) => !alreadyPublishedViaGdocsSlugsSet.has(postrow.slug)
)
const gdocPosts = await Gdoc.getPublishedGdocs()
const charts = (await db
.knexTable(Chart.table)
.select(db.knexRaw(`updatedAt, config->>"$.slug" AS slug`))
Expand All @@ -94,9 +88,9 @@ export const makeSitemap = async (explorerAdminServer: ExplorerAdminServer) => {
})
)
.concat(
posts.map((p) => ({
postsApi.map((p) => ({
loc: urljoin(BAKED_BASE_URL, p.slug),
lastmod: dayjs(p.updated_at_in_wordpress).format("YYYY-MM-DD"),
lastmod: dayjs(p.modified_gmt).format("YYYY-MM-DD"),
}))
)
.concat(
Expand Down
18 changes: 18 additions & 0 deletions db/db.ts
Original file line number Diff line number Diff line change
Expand Up @@ -109,3 +109,21 @@ export const knexTable = (table: string): Knex.QueryBuilder =>
knexInstance().table(table)

export const knexRaw = (str: string): Knex.Raw => knexInstance().raw(str)

/**
 * Collects the slugs of all rows in `posts_with_gdoc_publish_status` whose
 * `isGdocPublished` flag is TRUE.
 *
 * Background: in the backporting workflow, users create gdoc posts for
 * existing WP posts. While such a gdoc is unpublished, the post is still baked
 * from WordPress; once the user presses publish on the gdoc, the WordPress
 * version should no longer be baked. Callers use the returned set to exclude
 * those slugs.
 *
 * @returns a Set containing the `slug` of every matching row.
 */
export const getSlugsWithPublishedGdocsSuccessors = async (): Promise<
    Set<string>
> => {
    const queryResult = await knexRaw(
        `-- sql
select slug from posts_with_gdoc_publish_status
where isGdocPublished = TRUE`
    )
    // The row list is the first element of the raw query result.
    const rows = queryResult[0]
    const slugs = rows.map((row: any) => row.slug)
    return new Set(slugs)
}
22 changes: 2 additions & 20 deletions db/wpdb.ts
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,7 @@ export const getPosts = async (
postTypes: string[] = [WP_PostType.Post, WP_PostType.Page],
filterFunc?: FilterFnPostRestApi,
limit?: number
): Promise<any[]> => {
): Promise<PostRestApi[]> => {
if (!isWordpressAPIEnabled) return []

const perPage = 20
Expand Down Expand Up @@ -720,27 +720,9 @@ export const getBlockContent = async (

return post.data?.wpBlock?.content ?? undefined
}
export interface PostAPI {
id: number
type: WP_PostType
slug: string
title: {
rendered: string
}
date_gmt: string
modified_gmt: string
authors_name?: string[]
content: { rendered: string }
excerpt: { rendered: string }
featured_media_paths: {
medium_large: string
thumbnail: string
}
featured_media: number
}

export const getFullPost = async (
postApi: PostAPI,
postApi: PostRestApi,
excludeContent?: boolean
): Promise<FullPost> => ({
id: postApi.id,
Expand Down
37 changes: 37 additions & 0 deletions packages/@ourworldindata/utils/src/owidTypes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,43 @@ export interface PostRestApi {
latest?: boolean
}
}
id: number
date: string
date_gmt: string
guid: {
rendered: string
}
modified: string
modified_gmt: string

status: string
type: WP_PostType
link: string
title: {
rendered: string
}
content: {
rendered: string
protected: boolean
}
excerpt: {
rendered: string
protected: boolean
}
author: number
featured_media: number
comment_status: string
ping_status: string
sticky: boolean
template: string
format: string
categories: number[]
tags: any[]
authors_name: string[]
featured_media_paths: {
thumbnail: string
medium_large: string
}
}

export interface KeyInsight {
Expand Down

0 comments on commit f3d12b0

Please sign in to comment.