diff --git a/adminSiteServer/apiRouter.ts b/adminSiteServer/apiRouter.ts index b963fc502b2..b9e7ce1a4c6 100644 --- a/adminSiteServer/apiRouter.ts +++ b/adminSiteServer/apiRouter.ts @@ -135,7 +135,7 @@ import { getGdocBaseObjectById, setLinksForGdoc, setTagsForGdoc, - addImagesToContentGraph, + syncImagesAndAddToContentGraph, updateGdocContentOnly, upsertGdoc, } from "../db/model/Gdoc/GdocFactory.js" @@ -2276,7 +2276,6 @@ getRouteNonIdempotentWithRWTransaction( | undefined try { - // Beware: if contentSource=gdocs this will update images in the DB+S3 even if the gdoc is published const gdoc = await getAndLoadGdocById(trx, id, contentSource) if (!gdoc.published) { @@ -2368,7 +2367,7 @@ putRouteWithRWTransaction(apiRouter, "/gdocs/:id", async (req, res, trx) => { const nextGdoc = gdocFromJSON(req.body) await nextGdoc.loadState(trx) - await addImagesToContentGraph(trx, nextGdoc) + await syncImagesAndAddToContentGraph(trx, nextGdoc) await setLinksForGdoc( trx, diff --git a/baker/SiteBaker.tsx b/baker/SiteBaker.tsx index c5134f53c07..c1090271120 100644 --- a/baker/SiteBaker.tsx +++ b/baker/SiteBaker.tsx @@ -1,7 +1,7 @@ import fs from "fs-extra" import path from "path" import { glob } from "glob" -import { keyBy, without, uniq, mapValues, pick, chunk } from "lodash" +import { keyBy, without, uniq, mapValues, pick } from "lodash" import ProgressBar from "progress" import * as wpdb from "../db/wpdb.js" import * as db from "../db/db.js" @@ -100,10 +100,7 @@ import { getVariableMetadata, getVariableOfDatapageIfApplicable, } from "../db/model/Variable.js" -import { - gdocFromJSON, - getAllMinimalGdocBaseObjects, -} from "../db/model/Gdoc/GdocFactory.js" +import { getAllMinimalGdocBaseObjects } from "../db/model/Gdoc/GdocFactory.js" import { getBakePath } from "@ourworldindata/components" import { GdocAuthor } from "../db/model/Gdoc/GdocAuthor.js" import { DATA_INSIGHTS_ATOM_FEED_NAME } from "../site/gdocs/utils.js" @@ -307,7 +304,6 @@ export class SiteBaker { picks?: [string[], string[], string[], string[]] ): Promise { if (!this._prefetchedAttachmentsCache) { - console.log("Prefetching attachments") const publishedGdocs = await getAllMinimalGdocBaseObjects(knex) const publishedGdocsDictionary = keyBy(publishedGdocs, "id") @@ -332,31 +328,23 @@ export class SiteBaker { // Includes redirects const publishedChartsRaw = await mapSlugsToConfigs(knex) - const publishedCharts: LinkedChart[] = [] - - for (const publishedChartsRawChunk of chunk( - publishedChartsRaw, - 20 - )) { - await Promise.all( - publishedChartsRawChunk.map(async (chart) => { - const tab = chart.config.tab ?? GrapherTabOption.chart - const datapageIndicator = - await getVariableOfDatapageIfApplicable( - chart.config - ) - publishedCharts.push({ - originalSlug: chart.slug, - resolvedUrl: `${BAKED_GRAPHER_URL}/${chart.config.slug}`, - tab, - title: chart.config.title || "", - thumbnail: `${BAKED_GRAPHER_EXPORTS_BASE_URL}/${chart.config.slug}.svg`, - indicatorId: datapageIndicator?.id, - tags: [], - }) - }) - ) - } + const publishedCharts: LinkedChart[] = await Promise.all( + publishedChartsRaw.map(async (chart) => { + const tab = chart.config.tab ?? GrapherTabOption.chart + const datapageIndicator = + await getVariableOfDatapageIfApplicable(chart.config) + return { + originalSlug: chart.slug, + resolvedUrl: `${BAKED_GRAPHER_URL}/${chart.config.slug}`, + tab, + queryString: "", + title: chart.config.title || "", + thumbnail: `${BAKED_GRAPHER_EXPORTS_BASE_URL}/${chart.config.slug}.svg`, + indicatorId: datapageIndicator?.id, + tags: [], + } + }) + ) const publishedChartsBySlug = keyBy(publishedCharts, "originalSlug") const publishedChartsWithIndicatorIds = publishedCharts.filter( @@ -498,10 +486,7 @@ export class SiteBaker { // TODO: this transaction is only RW because somewhere inside it we fetch images async bakeGDocPosts(knex: db.KnexReadWriteTransaction, slugs?: string[]) { if (!this.bakeSteps.has("gdocPosts")) return - // We don't need to load these as we prefetch all attachments - const publishedGdocs = await db - .getPublishedGdocPosts(knex) - .then((gdocs) => gdocs.map(gdocFromJSON)) + const publishedGdocs = await GdocPost.getPublishedGdocPosts(knex) const gdocsToBake = slugs !== undefined @@ -534,9 +519,7 @@ export class SiteBaker { publishedGdoc.linkedIndicators = attachments.linkedIndicators // this is a no-op if the gdoc doesn't have an all-chart block - if ("loadRelatedCharts" in publishedGdoc) { - await publishedGdoc.loadRelatedCharts(knex) - } + await publishedGdoc.loadRelatedCharts(knex) await publishedGdoc.validate(knex) if ( diff --git a/baker/algolia/algoliaUtils.tsx b/baker/algolia/algoliaUtils.tsx index e531bc9369d..ea93fe75cfe 100644 --- a/baker/algolia/algoliaUtils.tsx +++ b/baker/algolia/algoliaUtils.tsx @@ -23,6 +23,7 @@ import { SearchIndexName, } from "../../site/search/searchTypes.js" import { getAnalyticsPageviewsByUrlObj } from "../../db/model/Pageview.js" +import { GdocPost } from "../../db/model/Gdoc/GdocPost.js" import { ArticleBlocks } from "../../site/gdocs/components/ArticleBlocks.js" import React from "react" import { @@ -33,8 +34,6 @@ import { import { getIndexName } from "../../site/search/searchClient.js" import { ObjectWithObjectID } from "@algolia/client-search" import { SearchIndex } from "algoliasearch" -import { match, P } from "ts-pattern" -import { gdocFromJSON } from "../../db/model/Gdoc/GdocFactory.js" interface TypeAndImportance { type: PageType @@ -147,27 +146,20 @@ function generateGdocRecords( const getPostTypeAndImportance = ( gdoc: OwidGdocPostInterface ): TypeAndImportance => { - return match(gdoc.content.type) - .with(OwidGdocType.Article, () => ({ - type: "article" as const, - importance: 0, - })) - .with(OwidGdocType.AboutPage, () => ({ - type: "about" as const, - importance: 1, - })) - .with( - P.union(OwidGdocType.TopicPage, OwidGdocType.LinearTopicPage), - () => ({ - type: "topic" as const, - importance: 3, - }) - ) - .with(P.union(OwidGdocType.Fragment, undefined), () => ({ - type: "other" as const, - importance: 0, - })) - .exhaustive() + switch (gdoc.content.type) { + case OwidGdocType.TopicPage: + return { type: "topic", importance: 3 } + case OwidGdocType.LinearTopicPage: + return { type: "topic", importance: 3 } + case OwidGdocType.Fragment: + // this should not happen because we filter out fragments; but we want to have an exhaustive switch/case so we include it + return { type: "other", importance: 0 } + case OwidGdocType.AboutPage: + return { type: "about", importance: 0 } + case OwidGdocType.Article: + case undefined: + return { type: "article", importance: 0 } + } } const records: PageRecord[] = [] @@ -210,11 +202,9 @@ function generateGdocRecords( // Generate records for countries, WP posts (not including posts that have been succeeded by Gdocs equivalents), and Gdocs export const getPagesRecords = async (knex: db.KnexReadWriteTransaction) => { const pageviews = await getAnalyticsPageviewsByUrlObj(knex) - const gdocs = await db - .getPublishedGdocPosts(knex) - .then((gdocs) => gdocs.map(gdocFromJSON) as OwidGdocPostInterface[]) - + const gdocs = await GdocPost.getPublishedGdocPosts(knex) const publishedGdocsBySlug = keyBy(gdocs, "slug") + // TODO: the knex instance should be handed down as a parameter const slugsWithPublishedGdocsSuccessors = await db.getSlugsWithPublishedGdocsSuccessors(knex) const postsApi = await getPostsFromSnapshots(knex, undefined, (post) => { diff --git a/baker/algolia/indexToAlgolia.tsx b/baker/algolia/indexToAlgolia.tsx index 67015b21041..540f36f5f08 100644 --- a/baker/algolia/indexToAlgolia.tsx +++ b/baker/algolia/indexToAlgolia.tsx @@ -25,8 +25,6 @@ const indexToAlgolia = async () => { await index.replaceAllObjects(records) await wpdb.singleton.end() - - process.exit(0) } process.on("unhandledRejection", (e) => { diff --git a/baker/sitemap.ts b/baker/sitemap.ts index c02c799f866..544727f9562 100644 --- a/baker/sitemap.ts +++ b/baker/sitemap.ts @@ -15,6 +15,7 @@ import { countryProfileSpecs } from "../site/countryProfileProjects.js" import { ExplorerAdminServer } from "../explorerAdminServer/ExplorerAdminServer.js" import { EXPLORERS_ROUTE_FOLDER } from "../explorer/ExplorerConstants.js" import { ExplorerProgram } from "../explorer/ExplorerProgram.js" +import { GdocPost } from "../db/model/Gdoc/GdocPost.js" import { getPostsFromSnapshots } from "../db/model/Post.js" import { calculateDataInsightIndexPageCount } from "../db/model/Gdoc/gdocUtils.js" @@ -73,7 +74,7 @@ export const makeSitemap = async ( undefined, (postrow) => !alreadyPublishedViaGdocsSlugsSet.has(postrow.slug) ) - const gdocPosts = await db.getPublishedGdocPosts(knex) + const gdocPosts = await GdocPost.getPublishedGdocPosts(knex) const publishedDataInsights = await db.getPublishedDataInsights(knex) const dataInsightFeedPageCount = calculateDataInsightIndexPageCount( diff --git a/db/db.ts b/db/db.ts index 288bf226dc4..a89bddf2eed 100644 --- a/db/db.ts +++ b/db/db.ts @@ -10,12 +10,8 @@ import { registerExitHandler } from "./cleanup.js" import { keyBy } from "@ourworldindata/utils" import { DbChartTagJoin, - DbEnrichedPostGdoc, - DbRawPostGdoc, - ImageMetadata, MinimalDataInsightInterface, OwidGdocType, - parsePostsGdocsRow, } from "@ourworldindata/types" // Return the first match from a mysql query @@ -314,50 +310,3 @@ export const getHomepageId = ( AND published = TRUE` ).then((result) => result?.id) } - -export const getImageMetadataByFilenames = async ( - knex: KnexReadonlyTransaction, - filenames: string[] -): Promise> => { - if (filenames.length === 0) return {} - const rows = await knexRaw( - knex, - `-- sql - SELECT - id, - googleId, - filename, - defaultAlt, - updatedAt, - originalWidth, - originalHeight - FROM - images - WHERE filename IN (?)`, - [filenames] - ) - return keyBy(rows, "filename") -} - -export const getPublishedGdocPosts = async ( - knex: KnexReadonlyTransaction -): Promise => { - return knexRaw( - knex, - `-- sql - SELECT * - FROM posts_gdocs - WHERE published = 1 - AND content ->> '$.type' IN (:types) - AND publishedAt <= NOW() - ORDER BY publishedAt DESC`, - { - types: [ - OwidGdocType.Article, - OwidGdocType.LinearTopicPage, - OwidGdocType.TopicPage, - OwidGdocType.AboutPage, - ], - } - ).then((rows) => rows.map(parsePostsGdocsRow)) -} diff --git a/db/model/Gdoc/GdocAuthor.ts b/db/model/Gdoc/GdocAuthor.ts index 7d6d60147e1..21709633fcf 100644 --- a/db/model/Gdoc/GdocAuthor.ts +++ b/db/model/Gdoc/GdocAuthor.ts @@ -43,14 +43,16 @@ export class GdocAuthor extends GdocBase implements OwidGdocAuthorInterface { return blocks } + // TODO: this transaction is only RW because somewhere inside it we fetch images _loadSubclassAttachments = ( - knex: db.KnexReadonlyTransaction + knex: db.KnexReadWriteTransaction ): Promise => { return this.loadLatestWorkImages(knex) } + // TODO: this transaction is only RW because somewhere inside it we fetch images loadLatestWorkImages = async ( - knex: db.KnexReadonlyTransaction + knex: db.KnexReadWriteTransaction ): Promise => { if (!this.content.title) return @@ -75,7 +77,7 @@ export class GdocAuthor extends GdocBase implements OwidGdocAuthorInterface { // Load the image metadata for the latest work images, including the // default featured image which is used as a fallback in the entire // research and writing block - return super.loadImageMetadataFromDB(knex, [ + return super.loadImageMetadata(knex, [ ...latestWorkImageFilenames, DEFAULT_GDOC_FEATURED_IMAGE, ]) diff --git a/db/model/Gdoc/GdocBase.ts b/db/model/Gdoc/GdocBase.ts index 443e78ccc6d..543d5f83425 100644 --- a/db/model/Gdoc/GdocBase.ts +++ b/db/model/Gdoc/GdocBase.ts @@ -5,6 +5,7 @@ import { LinkedIndicator, keyBy, ImageMetadata, + excludeUndefined, OwidGdocErrorMessage, OwidGdocErrorMessageType, excludeNullish, @@ -31,6 +32,7 @@ import { BAKED_GRAPHER_URL } from "../../../settings/serverSettings.js" import { google } from "googleapis" import { gdocToArchie } from "./gdocToArchie.js" import { archieToEnriched } from "./archieToEnriched.js" +import { imageStore } from "../Image.js" import { getChartConfigById, mapSlugsToIds } from "../Chart.js" import { BAKED_BASE_URL, @@ -84,8 +86,9 @@ export class GdocBase implements OwidGdocBaseInterface { ) => Promise = async () => [] _omittableFields: string[] = [] + // TODO: this transaction is only RW because somewhere inside it we fetch images _loadSubclassAttachments: ( - knex: db.KnexReadonlyTransaction + knex: db.KnexReadWriteTransaction ) => Promise = async () => undefined constructor(id?: string) { @@ -559,27 +562,29 @@ export class GdocBase implements OwidGdocBaseInterface { const slugToIdMap = await mapSlugsToIds(knex) // TODO: rewrite this as a single query instead of N queries const linkedGrapherCharts = await Promise.all( - this.linkedChartSlugs.grapher.map(async (originalSlug) => { - const chartId = slugToIdMap[originalSlug] - if (!chartId) return - const chart = await getChartConfigById(knex, chartId) - if (!chart) return - const resolvedSlug = chart.config.slug ?? "" - const resolvedTitle = chart.config.title ?? "" - const tab = chart.config.tab ?? GrapherTabOption.chart - const datapageIndicator = - await getVariableOfDatapageIfApplicable(chart.config) - const linkedChart: LinkedChart = { - originalSlug, - title: resolvedTitle, - tab, - resolvedUrl: `${BAKED_GRAPHER_URL}/${resolvedSlug}`, - thumbnail: `${BAKED_GRAPHER_EXPORTS_BASE_URL}/${resolvedSlug}.svg`, - tags: [], - indicatorId: datapageIndicator?.id, + [...this.linkedChartSlugs.grapher.values()].map( + async (originalSlug) => { + const chartId = slugToIdMap[originalSlug] + if (!chartId) return + const chart = await getChartConfigById(knex, chartId) + if (!chart) return + const resolvedSlug = chart.config.slug ?? "" + const resolvedTitle = chart.config.title ?? "" + const tab = chart.config.tab ?? GrapherTabOption.chart + const datapageIndicator = + await getVariableOfDatapageIfApplicable(chart.config) + const linkedChart: LinkedChart = { + originalSlug, + title: resolvedTitle, + tab, + resolvedUrl: `${BAKED_GRAPHER_URL}/${resolvedSlug}`, + thumbnail: `${BAKED_GRAPHER_EXPORTS_BASE_URL}/${resolvedSlug}.svg`, + tags: [], + indicatorId: datapageIndicator?.id, + } + return linkedChart } - return linkedChart - }) + ) ).then(excludeNullish) const publishedExplorersBySlug = @@ -636,25 +641,26 @@ export class GdocBase implements OwidGdocBaseInterface { this.linkedDocuments = keyBy(linkedDocuments, "id") } - /** - * Load image metadata from the database. Does not check Google Drive or sync to S3 - */ - async loadImageMetadataFromDB( - knex: db.KnexReadonlyTransaction, + // TODO: this transaction is only RW because somewhere inside it we fetch images + async loadImageMetadata( + knex: db.KnexReadWriteTransaction, filenames?: string[] ): Promise { const imagesFilenames = filenames ?? this.linkedImageFilenames if (!imagesFilenames.length) return - const imageMetadata = await db.getImageMetadataByFilenames( - knex, - imagesFilenames - ) + await imageStore.fetchImageMetadata(imagesFilenames) + const images = await imageStore + .syncImagesToS3(knex) + .then(excludeUndefined) + // Merge the new image metadata with the existing image metadata. This + // is used by GdocAuthor to load additional image metadata from the + // latest work section. this.imageMetadata = { ...this.imageMetadata, - ...keyBy(imageMetadata, "filename"), + ...keyBy(images, "filename"), } } @@ -778,9 +784,10 @@ export class GdocBase implements OwidGdocBaseInterface { ] } - async loadState(knex: db.KnexReadonlyTransaction): Promise { + // TODO: this transaction is only RW because somewhere inside it we fetch images + async loadState(knex: db.KnexReadWriteTransaction): Promise { await this.loadLinkedDocuments(knex) - await this.loadImageMetadataFromDB(knex) + await this.loadImageMetadata(knex) await this.loadLinkedCharts(knex) await this.loadLinkedIndicators() // depends on linked charts await this._loadSubclassAttachments(knex) diff --git a/db/model/Gdoc/GdocFactory.ts b/db/model/Gdoc/GdocFactory.ts index ec7ac324eda..b1a211c4299 100644 --- a/db/model/Gdoc/GdocFactory.ts +++ b/db/model/Gdoc/GdocFactory.ts @@ -34,12 +34,10 @@ import { KnexReadonlyTransaction, knexRaw, KnexReadWriteTransaction, - getImageMetadataByFilenames, - getPublishedGdocPosts, } from "../../db.js" import { enrichedBlocksToMarkdown } from "./enrichedToMarkdown.js" import { GdocAuthor } from "./GdocAuthor.js" -import { fetchImagesFromDriveAndSyncToS3 } from "../Image.js" +import { imageStore } from "../Image.js" export function gdocFromJSON( json: Record @@ -340,9 +338,6 @@ export async function loadGdocFromGdocBase( if (contentSource === GdocsContentSource.Gdocs) { // TODO: if we get here via fromJSON then we have already done this - optimize that? await gdoc.fetchAndEnrichGdoc() - // If we're loading from Gdocs, now's also the time to fetch images from gdrive and sync them to S3 - // In any other case, the images should already be in the DB and S3 - await fetchImagesFromDriveAndSyncToS3(knex, gdoc.filenames) } await gdoc.loadState(knex) @@ -400,7 +395,24 @@ export async function getAndLoadPublishedDataInsights( export async function getAndLoadPublishedGdocPosts( knex: KnexReadWriteTransaction ): Promise { - const rows = await getPublishedGdocPosts(knex) + const rows = await knexRaw( + knex, + `-- sql + SELECT * + FROM posts_gdocs + WHERE published = 1 + AND content ->> '$.type' IN (:types) + AND publishedAt <= NOW() + ORDER BY publishedAt DESC`, + { + types: [ + OwidGdocType.Article, + OwidGdocType.LinearTopicPage, + OwidGdocType.TopicPage, + OwidGdocType.AboutPage, + ], + } + ) const ids = rows.map((row) => row.id) const tags = await knexRaw( knex, @@ -414,7 +426,7 @@ export async function getAndLoadPublishedGdocPosts( const groupedTags = groupBy(tags, "gdocId") const enrichedRows = rows.map((row) => { return { - ...row, + ...parsePostsGdocsRow(row), tags: groupedTags[row.id] ? groupedTags[row.id] : null, } satisfies OwidGdocBaseInterface }) @@ -586,7 +598,7 @@ export async function getAllGdocIndexItemsOrderedByUpdatedAt( ) } -export async function addImagesToContentGraph( +export async function syncImagesAndAddToContentGraph( trx: KnexReadWriteTransaction, gdoc: GdocPost | GdocDataInsight | GdocHomepage | GdocAuthor ): Promise { @@ -598,13 +610,16 @@ export async function addImagesToContentGraph( // Includes fragments so that images in data pages are // synced to S3 and ultimately baked in bakeDriveImages(). if (filenames.length && gdoc.published) { - const images = await getImageMetadataByFilenames(trx, filenames) + await imageStore.fetchImageMetadata(filenames) + const images = await imageStore.syncImagesToS3(trx) const gdocXImagesToInsert: DbInsertPostGdocXImage[] = [] - for (const image of Object.values(images)) { - gdocXImagesToInsert.push({ - gdocId: gdoc.id, - imageId: image.id, - }) + for (const image of images) { + if (image) { + gdocXImagesToInsert.push({ + gdocId: gdoc.id, + imageId: image.id, + }) + } } try { await trx diff --git a/db/model/Gdoc/GdocPost.ts b/db/model/Gdoc/GdocPost.ts index 712625c11d7..b77254be481 100644 --- a/db/model/Gdoc/GdocPost.ts +++ b/db/model/Gdoc/GdocPost.ts @@ -23,8 +23,15 @@ import { ADMIN_BASE_URL } from "../../../settings/clientSettings.js" import { parseDetails, parseFaqs } from "./rawToEnriched.js" import { htmlToEnrichedTextBlock } from "./htmlToEnriched.js" import { GdocBase } from "./GdocBase.js" -import { KnexReadonlyTransaction, knexRaw } from "../../db.js" -import { getGdocBaseObjectById } from "./GdocFactory.js" +import { + KnexReadWriteTransaction, + KnexReadonlyTransaction, + knexRaw, +} from "../../db.js" +import { + getGdocBaseObjectById, + getAndLoadPublishedGdocPosts, +} from "./GdocFactory.js" export class GdocPost extends GdocBase implements OwidGdocPostInterface { content!: OwidGdocPostContent @@ -228,4 +235,23 @@ export class GdocPost extends GdocBase implements OwidGdocPostInterface { return parseDetails(gdoc.content.details) } + + // TODO: this transaction is only RW because somewhere inside it we fetch images + static async getPublishedGdocPosts( + knex: KnexReadWriteTransaction + ): Promise { + // #gdocsvalidation this cast means that we trust the admin code and + // workflow to provide published articles that have all the required content + // fields (see #gdocsvalidationclient and pending #gdocsvalidationserver). + // It also means that if a required field is added after the publication of + // an article, there won't currently be any checks preventing the then + // incomplete article to be republished (short of an error being raised down + // the line). A migration should then be added to update current articles + // with a sensible default for the new required content field. An + // alternative would be to encapsulate that default in + // mapGdocsToWordpressPosts(). This would make the Gdoc entity coming from + // the database dependent on the mapping function, which is more practical + // but also makes it less of a source of truth when considered in isolation. + return getAndLoadPublishedGdocPosts(knex) + } } diff --git a/db/model/Image.ts b/db/model/Image.ts index 03cd745f83a..3d9f11144c2 100644 --- a/db/model/Image.ts +++ b/db/model/Image.ts @@ -32,9 +32,9 @@ import { import { KnexReadWriteTransaction, KnexReadonlyTransaction } from "../db.js" class ImageStore { - async fetchImageMetadata( - filenames: string[] - ): Promise> { + images: Record | undefined + + async fetchImageMetadata(filenames: string[]): Promise { console.log( `Fetching image metadata from Google Drive ${ filenames.length ? `for ${filenames.join(", ")}` : "" @@ -96,8 +96,8 @@ class ImageStore { filename: google.name, defaultAlt: google.description ?? "", updatedAt: new Date(google.modifiedTime).getTime(), - originalWidth: google.imageMediaMetadata?.width ?? null, - originalHeight: google.imageMediaMetadata?.height ?? null, + originalWidth: google.imageMediaMetadata?.width, + originalHeight: google.imageMediaMetadata?.height, })) const duplicateFilenames = findDuplicates( @@ -110,23 +110,13 @@ class ImageStore { ) } - console.log( - `Fetched ${images.length} images' metadata from Google Drive` - ) - const imageMetadata = keyBy(images, "filename") - // Only applies when we're fetching specific images i.e. `filenames` is not empty - for (const filename of filenames) { - if (!imageMetadata[filename]) { - throw Error(`Image ${filename} not found in Google Drive`) - } - } - return imageMetadata + this.images = keyBy(images, "filename") } async syncImagesToS3( - knex: KnexReadWriteTransaction, - images: Record + knex: KnexReadWriteTransaction ): Promise<(Image | undefined)[]> { + const images = this.images if (!images) return [] return Promise.all( Object.keys(images).map((filename) => @@ -147,7 +137,6 @@ export const s3Client = new S3Client({ secretAccessKey: IMAGE_HOSTING_R2_SECRET_ACCESS_KEY, }, }) - export class Image implements ImageMetadata { id!: number googleId!: string @@ -185,7 +174,7 @@ export class Image implements ImageMetadata { static async syncImage( knex: KnexReadWriteTransaction, metadata: ImageMetadata - ): Promise { + ): Promise { const fresh = new Image(metadata) const stored = await getImageByFilename(knex, metadata.filename) @@ -194,14 +183,12 @@ export class Image implements ImageMetadata { if ( stored.updatedAt !== fresh.updatedAt || stored.defaultAlt !== fresh.defaultAlt || - stored.originalWidth !== fresh.originalWidth || - stored.originalHeight !== fresh.originalHeight + stored.originalWidth !== fresh.originalWidth ) { await fresh.fetchFromDriveAndUploadToS3() stored.updatedAt = fresh.updatedAt stored.defaultAlt = fresh.defaultAlt stored.originalWidth = fresh.originalWidth - stored.originalHeight = fresh.originalHeight await updateImage(knex, stored.id, { updatedAt: fresh.updatedAt, defaultAlt: fresh.defaultAlt, @@ -217,8 +204,9 @@ export class Image implements ImageMetadata { return fresh } } catch (e) { - throw new Error(`Error syncing image ${metadata.filename}: ${e}`) + console.error(`Error syncing ${fresh.filename}`, e) } + return } async fetchFromDriveAndUploadToS3(): Promise { @@ -309,22 +297,3 @@ export async function insertImageObject( const [id] = await knex.table("images").insert(image) return id } - -export async function fetchImagesFromDriveAndSyncToS3( - knex: KnexReadWriteTransaction, - filenames: string[] = [] -): Promise { - if (!filenames.length) return [] - - try { - const imageMetadata = await imageStore.fetchImageMetadata(filenames) - - const metadataArray = Object.values(imageMetadata) as ImageMetadata[] - - return Promise.all( - metadataArray.map((metadata) => Image.syncImage(knex, metadata)) - ) - } catch (e) { - throw new Error(`Error fetching images from Drive: ${e}`) - } -} diff --git a/devTools/updateImageHeights/update-image-heights.ts b/devTools/updateImageHeights/update-image-heights.ts index d58f89770e2..e8938c91ea9 100644 --- a/devTools/updateImageHeights/update-image-heights.ts +++ b/devTools/updateImageHeights/update-image-heights.ts @@ -15,10 +15,10 @@ async function updateImageHeights() { .then((rows) => rows.map((row) => row.filename)) console.log("Fetching image metadata...") - const images = await imageStore.fetchImageMetadata([]) + await imageStore.fetchImageMetadata([]) console.log("Fetching image metadata...done") - if (!images) { + if (!imageStore.images) { throw new Error("No images found") } @@ -28,7 +28,7 @@ async function updateImageHeights() { for (const batch of lodash.chunk(filenames, 20)) { const promises = [] for (const filename of batch) { - const image = images[filename] + const image = imageStore.images[filename] if (image && image.originalHeight) { promises.push( db.knexRaw( diff --git a/packages/@ourworldindata/types/src/gdocTypes/Image.ts b/packages/@ourworldindata/types/src/gdocTypes/Image.ts index f9e162ac31c..4c56fc9e9fd 100644 --- a/packages/@ourworldindata/types/src/gdocTypes/Image.ts +++ b/packages/@ourworldindata/types/src/gdocTypes/Image.ts @@ -1,5 +1,3 @@ -import { DbEnrichedImage } from "../dbTypes/Images.js" - // This is the JSON we get from Google's API before remapping the keys to be consistent with the rest of our interfaces export interface GDriveImageMetadata { name: string // -> filename @@ -12,14 +10,13 @@ export interface GDriveImageMetadata { } } -// All the data we use in the client to render images -// everything except the ID, effectively -export type ImageMetadata = Pick< - DbEnrichedImage, - | "defaultAlt" - | "filename" - | "googleId" - | "originalHeight" - | "originalWidth" - | "updatedAt" -> +export interface ImageMetadata { + googleId: string + filename: string + defaultAlt: string + // MySQL Date objects round to the nearest second, whereas Google includes milliseconds + // so we store as an epoch to avoid any conversion issues + updatedAt: number | null + originalWidth?: number | null + originalHeight?: number | null +}