diff --git a/.env.devcontainer b/.env.devcontainer index 7bc29cc6500..dc8a8bd8d54 100644 --- a/.env.devcontainer +++ b/.env.devcontainer @@ -16,8 +16,8 @@ GDOCS_CLIENT_ID='' GDOCS_BASIC_ARTICLE_TEMPLATE_URL='' GDOCS_SHARED_DRIVE_ID='' -IMAGE_HOSTING_R2_ENDPOINT='' +R2_ENDPOINT='' IMAGE_HOSTING_R2_CDN_URL='' IMAGE_HOSTING_R2_BUCKET_PATH='' -IMAGE_HOSTING_R2_ACCESS_KEY_ID='' -IMAGE_HOSTING_R2_SECRET_ACCESS_KEY='' +R2_ACCESS_KEY_ID='' +R2_SECRET_ACCESS_KEY='' diff --git a/.env.example-full b/.env.example-full index 8407d05b69b..d0cb5063adf 100644 --- a/.env.example-full +++ b/.env.example-full @@ -22,11 +22,17 @@ GDOCS_BASIC_ARTICLE_TEMPLATE_URL= GDOCS_SHARED_DRIVE_ID= GDOCS_DONATE_FAQS_DOCUMENT_ID= # optional -IMAGE_HOSTING_R2_ENDPOINT= # optional +R2_ENDPOINT= # optional IMAGE_HOSTING_R2_CDN_URL= IMAGE_HOSTING_R2_BUCKET_PATH= -IMAGE_HOSTING_R2_ACCESS_KEY_ID= # optional -IMAGE_HOSTING_R2_SECRET_ACCESS_KEY= # optional +R2_ACCESS_KEY_ID= # optional +R2_SECRET_ACCESS_KEY= # optional +# These two GRAPHER_CONFIG_ settings are used to store grapher configs in an R2 bucket. +# The cloudflare workers for thumbnail rendering etc use these settings to fetch the grapher configs. +# This means that for most local dev it is not necessary to set these. +GRAPHER_CONFIG_R2_BUCKET= # optional - for local dev set it to "owid-grapher-configs-staging" +GRAPHER_CONFIG_R2_BUCKET_PATH= # optional - for local dev set it to "devs/YOURNAME" + OPENAI_API_KEY= diff --git a/.gitignore b/.gitignore index 388475e6261..e33cb39db80 100755 --- a/.gitignore +++ b/.gitignore @@ -43,3 +43,4 @@ dist/ .nx/workspace-data .dev.vars **/tsup.config.bundled*.mjs +cfstorage/ diff --git a/adminSiteServer/apiRouter.ts b/adminSiteServer/apiRouter.ts index 423baba558b..800f9d0c7b9 100644 --- a/adminSiteServer/apiRouter.ts +++ b/adminSiteServer/apiRouter.ts @@ -9,7 +9,11 @@ import { ADMIN_BASE_URL, DATA_API_URL, } from "../settings/serverSettings.js" -import { expectInt, isValidSlug } from "../serverUtils/serverUtil.js" +import { + Base64String, + expectInt, + isValidSlug, +} from "../serverUtils/serverUtil.js" import { OldChartFieldList, assignTagsForCharts, @@ -167,6 +171,13 @@ import { GdocDataInsight } from "../db/model/Gdoc/GdocDataInsight.js" import { GdocHomepage } from "../db/model/Gdoc/GdocHomepage.js" import { GdocAuthor } from "../db/model/Gdoc/GdocAuthor.js" import path from "path" +import { + deleteGrapherConfigFromR2, + deleteGrapherConfigFromR2ByUUID, + R2GrapherConfigDirectory, + saveGrapherConfigToR2, + saveGrapherConfigToR2ByUUID, +} from "./chartConfigR2Helpers.js" const apiRouter = new FunctionalRouter() @@ -303,7 +314,7 @@ const saveNewChart = async ( // new charts inherit by default shouldInherit = true, }: { config: GrapherInterface; user: DbPlainUser; shouldInherit?: boolean } -): Promise => { +): Promise<{ patchConfig: GrapherInterface; fullConfig: GrapherInterface }> => { // grab the parent of the chart if inheritance should be enabled const parent = shouldInherit ? await getParentByChartConfig(knex, config) @@ -316,9 +327,10 @@ const saveNewChart = async ( // compute patch and full configs const patchConfig = diffGrapherConfigs(config, fullParentConfig) const fullConfig = mergeGrapherConfigs(fullParentConfig, patchConfig) + const fullConfigStringified = serializeChartConfig(fullConfig) // insert patch & full configs into the chart_configs table - const configId = uuidv7() + const chartConfigId = uuidv7() await db.knexRaw( knex, `-- sql @@ -326,9 +338,9 @@ const saveNewChart = async ( VALUES (?, ?, ?) 
`, [ - configId, + chartConfigId, serializeChartConfig(patchConfig), - serializeChartConfig(fullConfig), + fullConfigStringified, ] ) @@ -339,7 +351,7 @@ const saveNewChart = async ( INSERT INTO charts (configId, isInheritanceEnabled, lastEditedAt, lastEditedByUserId) VALUES (?, ?, ?, ?) `, - [configId, shouldInherit, new Date(), user.id] + [chartConfigId, shouldInherit, new Date(), user.id] ) // The chart config itself has an id field that should store the id of the chart - update the chart now so this is true @@ -359,7 +371,25 @@ const saveNewChart = async ( [chartId, chartId, chartId] ) - return patchConfig + // We need to get the full config and the md5 hash from the database instead of + // computing our own md5 hash because MySQL normalizes JSON and our + // client computed md5 would be different from the ones computed by and stored in R2 + const fullConfigMd5 = await db.knexRawFirst< + Pick + >( + knex, + `-- sql + select full, fullMd5 from chart_configs where id = ?`, + [chartConfigId] + ) + + await saveGrapherConfigToR2ByUUID( + chartConfigId, + fullConfigMd5!.full, + fullConfigMd5!.fullMd5 as Base64String + ) + + return { patchConfig, fullConfig } } const updateExistingChart = async ( @@ -372,7 +402,7 @@ const updateExistingChart = async ( // if true or false, enable or disable inheritance shouldInherit?: boolean } -): Promise => { +): Promise<{ patchConfig: GrapherInterface; fullConfig: GrapherInterface }> => { const { config, user, chartId } = params // make sure that the id of the incoming config matches the chart id @@ -393,22 +423,31 @@ const updateExistingChart = async ( // compute patch and full configs const patchConfig = diffGrapherConfigs(config, fullParentConfig) const fullConfig = mergeGrapherConfigs(fullParentConfig, patchConfig) + const fullConfigStringified = serializeChartConfig(fullConfig) + + const chartConfigId = await db.knexRawFirst>( + knex, + `SELECT configId FROM charts WHERE id = ?`, + [chartId] + ) + + if (!chartConfigId) + throw new JsonError(`No chart config found for id ${chartId}`, 404) // update configs await db.knexRaw( knex, `-- sql - UPDATE chart_configs cc - JOIN charts c ON c.configId = cc.id + UPDATE chart_configs SET - cc.patch=?, - cc.full=? - WHERE c.id = ? + patch=?, + full=? + WHERE id = ? 
`, [ serializeChartConfig(patchConfig), - serializeChartConfig(fullConfig), - chartId, + fullConfigStringified, + chartConfigId.configId, ] ) @@ -423,7 +462,25 @@ const updateExistingChart = async ( [shouldInherit, new Date(), user.id, chartId] ) - return patchConfig + // We need to get the full config and the md5 hash from the database instead of + // computing our own md5 hash because MySQL normalizes JSON and our + // client computed md5 would be different from the ones computed by and stored in R2 + const fullConfigMd5 = await db.knexRawFirst< + Pick + >( + knex, + `-- sql + select full, fullMd5 from chart_configs where id = ?`, + [chartConfigId.configId] + ) + + await saveGrapherConfigToR2ByUUID( + chartConfigId.configId, + fullConfigMd5!.full, + fullConfigMd5!.fullMd5 as Base64String + ) + + return { patchConfig, fullConfig } } const saveGrapher = async ( @@ -505,6 +562,11 @@ const saveGrapher = async ( `INSERT INTO chart_slug_redirects (chart_id, slug) VALUES (?, ?)`, [existingConfig.id, existingConfig.slug] ) + // When we rename grapher configs, make sure to delete the old one (the new one will be saved below) + await deleteGrapherConfigFromR2( + R2GrapherConfigDirectory.publishedGrapherBySlug, + `${existingConfig.slug}.json` + ) } } @@ -540,28 +602,34 @@ const saveGrapher = async ( // Execute the actual database update or creation let chartId: number + let patchConfig: GrapherInterface + let fullConfig: GrapherInterface if (existingConfig) { chartId = existingConfig.id! - newConfig = await updateExistingChart(knex, { + const configs = await updateExistingChart(knex, { config: newConfig, user, chartId, shouldInherit, }) + patchConfig = configs.patchConfig + fullConfig = configs.fullConfig } else { - newConfig = await saveNewChart(knex, { + const configs = await saveNewChart(knex, { config: newConfig, user, shouldInherit, }) - chartId = newConfig.id! + patchConfig = configs.patchConfig + fullConfig = configs.fullConfig + chartId = fullConfig.id! } // Record this change in version history const chartRevisionLog = { chartId: chartId as number, userId: user.id, - config: serializeChartConfig(newConfig), + config: serializeChartConfig(patchConfig), createdAt: new Date(), updatedAt: new Date(), } satisfies DbInsertChartRevision @@ -583,7 +651,7 @@ const saveGrapher = async ( chartId, ]) - const newDimensions = newConfig.dimensions ?? [] + const newDimensions = fullConfig.dimensions ?? 
[] for (const [i, dim] of newDimensions.entries()) { await db.knexRaw( knex, @@ -593,15 +661,38 @@ const saveGrapher = async ( } // So we can generate country profiles including this chart data - if (newConfig.isPublished && referencedVariablesMightChange) + if (fullConfig.isPublished && referencedVariablesMightChange) // TODO: remove this ad hoc knex transaction context when we switch the function to knex await denormalizeLatestCountryData( knex, newDimensions.map((d) => d.variableId) ) + if (fullConfig.isPublished) { + // We need to get the full config and the md5 hash from the database instead of + // computing our own md5 hash because MySQL normalizes JSON and our + // client computed md5 would be different from the ones computed by and stored in R2 + const fullConfigMd5 = await db.knexRawFirst< + Pick + >( + knex, + `-- sql + select cc.full, cc.fullMd5 from chart_configs cc + join charts c on c.configId = cc.id + where c.id = ?`, + [chartId] + ) + + await saveGrapherConfigToR2( + fullConfigMd5!.full, + R2GrapherConfigDirectory.publishedGrapherBySlug, + `${fullConfig.slug}.json`, + fullConfigMd5!.fullMd5 as Base64String + ) + } + if ( - newConfig.isPublished && + fullConfig.isPublished && (!existingConfig || !existingConfig.isPublished) ) { // Newly published, set publication info @@ -610,9 +701,9 @@ const saveGrapher = async ( `UPDATE charts SET publishedAt=?, publishedByUserId=? WHERE id = ? `, [new Date(), user.id, chartId] ) - await triggerStaticBuild(user, `Publishing chart ${newConfig.slug}`) + await triggerStaticBuild(user, `Publishing chart ${fullConfig.slug}`) } else if ( - !newConfig.isPublished && + !fullConfig.isPublished && existingConfig && existingConfig.isPublished ) { @@ -622,13 +713,17 @@ const saveGrapher = async ( `DELETE FROM chart_slug_redirects WHERE chart_id = ?`, [existingConfig.id] ) - await triggerStaticBuild(user, `Unpublishing chart ${newConfig.slug}`) - } else if (newConfig.isPublished) - await triggerStaticBuild(user, `Updating chart ${newConfig.slug}`) + await deleteGrapherConfigFromR2( + R2GrapherConfigDirectory.publishedGrapherBySlug, + `${existingConfig.slug}.json` + ) + await triggerStaticBuild(user, `Unpublishing chart ${fullConfig.slug}`) + } else if (fullConfig.isPublished) + await triggerStaticBuild(user, `Updating chart ${fullConfig.slug}`) return { chartId, - savedPatch: newConfig, + savedPatch: patchConfig, } } @@ -1010,11 +1105,13 @@ deleteRouteWithRWTransaction( [chart.id] ) - const row = await db.knexRawFirst<{ configId: number }>( + const row = await db.knexRawFirst>( trx, `SELECT configId FROM charts WHERE id = ?`, [chart.id] ) + if (!row || !row.configId) + throw new JsonError(`No chart config found for id ${chart.id}`, 404) if (row) { await db.knexRaw(trx, `DELETE FROM charts WHERE id=?`, [chart.id]) await db.knexRaw(trx, `DELETE FROM chart_configs WHERE id=?`, [ @@ -1028,6 +1125,13 @@ deleteRouteWithRWTransaction( `Deleting chart ${chart.slug}` ) + await deleteGrapherConfigFromR2ByUUID(row.configId) + if (chart.isPublished) + await deleteGrapherConfigFromR2( + R2GrapherConfigDirectory.publishedGrapherBySlug, + `${chart.slug}.json` + ) + return { success: true } } ) diff --git a/adminSiteServer/chartConfigR2Helpers.ts b/adminSiteServer/chartConfigR2Helpers.ts new file mode 100644 index 00000000000..ca32f8fc214 --- /dev/null +++ b/adminSiteServer/chartConfigR2Helpers.ts @@ -0,0 +1,155 @@ +import { + GRAPHER_CONFIG_R2_BUCKET, + GRAPHER_CONFIG_R2_BUCKET_PATH, + R2_ACCESS_KEY_ID, + R2_ENDPOINT, + R2_REGION, + R2_SECRET_ACCESS_KEY, +} from 
"../settings/serverSettings.js" +import { + DeleteObjectCommand, + DeleteObjectCommandInput, + PutObjectCommand, + PutObjectCommandInput, + S3Client, +} from "@aws-sdk/client-s3" +import { JsonError, lazy } from "@ourworldindata/utils" +import { logErrorAndMaybeSendToBugsnag } from "../serverUtils/errorLog.js" +import { Base64String } from "../serverUtils/serverUtil.js" + +export enum R2GrapherConfigDirectory { + byUUID = "config/by-uuid", + publishedGrapherBySlug = "grapher/by-slug", +} + +const getS3Client: () => S3Client = lazy( + () => + new S3Client({ + endpoint: R2_ENDPOINT, + forcePathStyle: false, + region: R2_REGION, + credentials: { + accessKeyId: R2_ACCESS_KEY_ID, + secretAccessKey: R2_SECRET_ACCESS_KEY, + }, + }) +) + +export async function saveGrapherConfigToR2ByUUID( + uuid: string, + chartConfigStringified: string, + configMd5FromDb: Base64String +) { + await saveGrapherConfigToR2( + chartConfigStringified, + R2GrapherConfigDirectory.byUUID, + `${uuid}.json`, + configMd5FromDb + ) +} + +export async function deleteGrapherConfigFromR2ByUUID(id: string) { + await deleteGrapherConfigFromR2( + R2GrapherConfigDirectory.byUUID, + `${id}.json` + ) +} + +export async function saveGrapherConfigToR2( + config_stringified: string, + directory: R2GrapherConfigDirectory, + filename: string, + configMd5FromDb: Base64String +) { + if (process.env.NODE_ENV === "test") { + console.log("Skipping saving grapher config to R2 in test environment") + return + } + if ( + GRAPHER_CONFIG_R2_BUCKET === undefined || + GRAPHER_CONFIG_R2_BUCKET_PATH === undefined + ) { + console.info( + "R2 bucket not configured, not storing grapher config to R2" + ) + return + } + try { + const s3Client = getS3Client() + + if (!GRAPHER_CONFIG_R2_BUCKET || !GRAPHER_CONFIG_R2_BUCKET_PATH) { + throw new Error("R2 bucket not configured") + } + + const bucket = GRAPHER_CONFIG_R2_BUCKET + const path = [GRAPHER_CONFIG_R2_BUCKET_PATH, directory, filename].join( + "/" + ) + + const MIMEType = "application/json" + + const params: PutObjectCommandInput = { + Bucket: bucket, + Key: path, + Body: config_stringified, + ContentType: MIMEType, + ContentMD5: configMd5FromDb, + } + + await s3Client.send(new PutObjectCommand(params)) + console.log( + `Successfully uploaded object: ${params.Bucket}/${params.Key}` + ) + } catch (err) { + await logErrorAndMaybeSendToBugsnag(err) + throw new JsonError( + `Failed to save the grapher config to R2. 
Inner error: ${err}`
+        )
+    }
+}
+
+export async function deleteGrapherConfigFromR2(
+    directory: R2GrapherConfigDirectory,
+    filename: string
+) {
+    if (process.env.NODE_ENV === "test") {
+        console.log("Skipping deleting grapher config from R2 in test environment")
+        return
+    }
+    if (
+        GRAPHER_CONFIG_R2_BUCKET === undefined ||
+        GRAPHER_CONFIG_R2_BUCKET_PATH === undefined
+    ) {
+        console.info(
+            "R2 bucket not configured, not deleting grapher config from R2"
+        )
+        return
+    }
+    try {
+        const s3Client = getS3Client()
+
+        if (!GRAPHER_CONFIG_R2_BUCKET || !GRAPHER_CONFIG_R2_BUCKET_PATH) {
+            throw new Error("R2 bucket not configured")
+        }
+
+        const bucket = GRAPHER_CONFIG_R2_BUCKET
+        const path = [GRAPHER_CONFIG_R2_BUCKET_PATH, directory, filename].join(
+            "/"
+        )
+
+        const params: DeleteObjectCommandInput = {
+            Bucket: bucket,
+            Key: path,
+        }
+
+        await s3Client.send(new DeleteObjectCommand(params))
+        console.log(
+            `Successfully deleted object: ${params.Bucket}/${params.Key}`
+        )
+    } catch (err) {
+        await logErrorAndMaybeSendToBugsnag(err)
+        throw new JsonError(
+            `Failed to delete the grapher config from R2 at ${directory}/${filename}. Inner error: ${err}`
+        )
+    }
+}
diff --git a/db/migration/1722415645057-AddChartConfigHash.ts b/db/migration/1722415645057-AddChartConfigHash.ts
new file mode 100644
index 00000000000..e6dcb5acfc7
--- /dev/null
+++ b/db/migration/1722415645057-AddChartConfigHash.ts
@@ -0,0 +1,17 @@
+import { MigrationInterface, QueryRunner } from "typeorm"
+
+export class AddChartConfigHash1722415645057 implements MigrationInterface {
+    public async up(queryRunner: QueryRunner): Promise<void> {
+        await queryRunner.query(`
+            ALTER TABLE chart_configs
+            ADD COLUMN fullMd5 CHAR(24) GENERATED ALWAYS AS (to_base64(unhex(md5(full)))) STORED NOT NULL;
+        `)
+    }
+
+    public async down(queryRunner: QueryRunner): Promise<void> {
+        await queryRunner.query(`
+            ALTER TABLE chart_configs
+            DROP COLUMN fullMd5;
+        `)
+    }
+}
diff --git a/db/model/Image.ts b/db/model/Image.ts
index 60799fc9782..7042f529eca 100644
--- a/db/model/Image.ts
+++ b/db/model/Image.ts
@@ -21,10 +21,10 @@ import {
 } from "@ourworldindata/utils"
 import { OwidGoogleAuth } from "../OwidGoogleAuth.js"
 import {
-    IMAGE_HOSTING_R2_ENDPOINT,
-    IMAGE_HOSTING_R2_ACCESS_KEY_ID,
-    IMAGE_HOSTING_R2_SECRET_ACCESS_KEY,
-    IMAGE_HOSTING_R2_REGION,
+    R2_ENDPOINT,
+    R2_ACCESS_KEY_ID,
+    R2_SECRET_ACCESS_KEY,
+    R2_REGION,
     IMAGE_HOSTING_R2_BUCKET_PATH,
     GDOCS_CLIENT_EMAIL,
     GDOCS_SHARED_DRIVE_ID,
@@ -139,12 +139,12 @@ class ImageStore {
 export const imageStore = new ImageStore()
 
 export const s3Client = new S3Client({
-    endpoint: IMAGE_HOSTING_R2_ENDPOINT,
+    endpoint: R2_ENDPOINT,
     forcePathStyle: false,
-    region: IMAGE_HOSTING_R2_REGION,
+    region: R2_REGION,
     credentials: {
-        accessKeyId: IMAGE_HOSTING_R2_ACCESS_KEY_ID,
-        secretAccessKey: IMAGE_HOSTING_R2_SECRET_ACCESS_KEY,
+        accessKeyId: R2_ACCESS_KEY_ID,
+        secretAccessKey: R2_SECRET_ACCESS_KEY,
     },
 })
diff --git a/devTools/syncGraphersToR2/README.md b/devTools/syncGraphersToR2/README.md
new file mode 100644
index 00000000000..cf136591bef
--- /dev/null
+++ b/devTools/syncGraphersToR2/README.md
@@ -0,0 +1,10 @@
+This script synchronizes the chart configs from the chart_configs table into the R2 bucket. The R2 bucket information is taken from the server settings, i.e. it comes from the .env file.
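+
+To run the sync, use the `syncGraphersToR2` script added to package.json (a minimal example; `--dry-run` only logs the changes that would be made):
+
+```
+yarn syncGraphersToR2 --dry-run
+yarn syncGraphersToR2
+```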
+
+Your R2 access key needs permission to access the right bucket (`owid-grapher-configs-staging`) and the following .env settings should be set for the sync script to work:
+
+```
+R2_ACCESS_KEY_ID
+R2_SECRET_ACCESS_KEY
+GRAPHER_CONFIG_R2_BUCKET
+GRAPHER_CONFIG_R2_BUCKET_PATH
+```
diff --git a/devTools/syncGraphersToR2/syncGraphersToR2.ts b/devTools/syncGraphersToR2/syncGraphersToR2.ts
new file mode 100644
index 00000000000..75c71543610
--- /dev/null
+++ b/devTools/syncGraphersToR2/syncGraphersToR2.ts
@@ -0,0 +1,317 @@
+import parseArgs from "minimist"
+import {
+    DeleteObjectCommand,
+    DeleteObjectCommandInput,
+    ListObjectsV2Command,
+    ListObjectsV2CommandOutput,
+    PutObjectCommand,
+    PutObjectCommandInput,
+    S3Client,
+} from "@aws-sdk/client-s3"
+import pMap from "p-map"
+import {
+    GRAPHER_CONFIG_R2_BUCKET,
+    GRAPHER_CONFIG_R2_BUCKET_PATH,
+    R2_ACCESS_KEY_ID,
+    R2_ENDPOINT,
+    R2_REGION,
+    R2_SECRET_ACCESS_KEY,
+} from "../../settings/serverSettings.js"
+import {
+    knexRaw,
+    KnexReadonlyTransaction,
+    knexReadonlyTransaction,
+} from "../../db/db.js"
+import { R2GrapherConfigDirectory } from "../../adminSiteServer/chartConfigR2Helpers.js"
+import { DbRawChartConfig, excludeUndefined } from "@ourworldindata/utils"
+import { chunk } from "lodash"
+import ProgressBar from "progress"
+import {
+    bytesToBase64,
+    HexString,
+    hexToBytes,
+} from "../../serverUtils/serverUtil.js"
+
+type HashAndId = Pick<DbRawChartConfig, "fullMd5" | "id">
+
+/** S3 list operations return at most 1000 items at a time. If there are
+    more results, the NextContinuationToken is set and you have to perform
+    another request using this token. This function abstracts over this implementation
+    detail and executes a given function over all items */
+async function listS3ObjectsAndPerformAction(
+    s3Client: S3Client,
+    pathPrefix: string,
+    action: (object: { Key?: string; ETag?: string }) => void
+) {
+    let continuationToken: string | undefined = undefined
+    do {
+        const listObjectsCommandInput = {
+            Bucket: GRAPHER_CONFIG_R2_BUCKET,
+            Prefix: pathPrefix,
+            ContinuationToken: continuationToken,
+        }
+        const listObjectsCommandOutput: ListObjectsV2CommandOutput =
+            await s3Client.send(
+                new ListObjectsV2Command(listObjectsCommandInput)
+            )
+        if ((listObjectsCommandOutput.Contents?.length ?? 0) > 0) {
+            listObjectsCommandOutput.Contents!.forEach((object) => {
+                action(object)
+            })
+        }
+        continuationToken = listObjectsCommandOutput.NextContinuationToken
+    } while (continuationToken)
+}
+
+/** Sync a set of chart configs with R2. Pass in a map of the keys to their md5 hashes and UUIDs
+    and this function will upsert all missing/outdated ones and delete any that are no longer needed.
+
+    @param s3Client The S3 client to use
+    @param pathPrefix The path prefix to use for the files (e.g. "config/by-uuid"; everything inside this prefix will be synced)
+    @param hashesOfFilesToToUpsert A map of the keys to their md5 hashes and UUIDs
+    @param trx The transaction to use for querying the DB for full configs
+    @param dryRun Whether to actually make changes to R2 or just log what would be done
+ */
+async function syncWithR2(
+    s3Client: S3Client,
+    pathPrefix: string,
+    hashesOfFilesToToUpsert: Map<string, HashAndId>,
+    trx: KnexReadonlyTransaction,
+    dryRun: boolean = false
+) {
+    // We'll first list all the files in the R2 bucket under the path prefix
+    // and check if the hash of each file that exists in R2 matches the hash
+    // of the file we want to upsert. If it does, we remove it from the list
+    // of files to upsert (nothing to do). If the hashes differ, we keep the
+    // entry so the file gets overwritten below. Any file in R2 that is not
+    // in the upsert list at all is scheduled for deletion.
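+    // The comparison below relies on the fact that for objects uploaded in a single PUT the
+    // R2/S3 ETag is the hex-encoded md5 of the object body (quoted, sometimes with a weak-validator
+    // prefix), while chart_configs.fullMd5 stores the same digest base64-encoded. We therefore
+    // strip the quotes and convert the ETag from hex to base64 before comparing the two.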
+ + const hashesOfFilesToDelete = new Map() + + // Usage: + await listS3ObjectsAndPerformAction(s3Client, pathPrefix, (object) => { + if (object && object.Key && object.ETag) { + const md5 = object.ETag.replace(/(W\/)?"/g, "") as HexString + const md5Base64 = bytesToBase64(hexToBytes(md5)) + + if (hashesOfFilesToToUpsert.has(object.Key)) { + // If the file in R2 exists and has the same hash as the one + // in list of files to upsert then we don't need to upsert it + // so we delete it from that map + if ( + hashesOfFilesToToUpsert.get(object.Key)?.fullMd5 === + md5Base64 + ) { + hashesOfFilesToToUpsert.delete(object.Key) + } + // If the existing full config in R2 is different then + // we just keep the hashesOfFilesToToUpsert entry around + // which will upsert the new full config later on + } else { + // if the file in R2 is not in the list of files to upsert + // then we should delete it + hashesOfFilesToDelete.set(object.Key, md5Base64) + } + } + }) + + console.log("Number of files to upsert", hashesOfFilesToToUpsert.size) + console.log("Number of files to delete", hashesOfFilesToDelete.size) + + let progressBar = new ProgressBar( + "--- Deleting obsolete configs [:bar] :current/:total :elapseds\n", + { + total: hashesOfFilesToDelete.size, + } + ) + + // Delete the files in R2 that are no longer needed + await pMap( + [...hashesOfFilesToDelete.keys()], + async (key) => { + const deleteObjectCommandInput: DeleteObjectCommandInput = { + Bucket: GRAPHER_CONFIG_R2_BUCKET, + Key: key, + } + if (!dryRun) { + await s3Client.send( + new DeleteObjectCommand(deleteObjectCommandInput) + ) + } else { + console.log("Would have deleted", key) + } + progressBar.tick() + }, + { concurrency: 20 } + ) + + console.log("Finished deletes") + + progressBar = new ProgressBar( + "--- Storing missing configs [:bar] :current/:total :elapseds\n", + { + total: hashesOfFilesToToUpsert.size, + } + ) + + const errors = [] + + // Chunk the inserts so that we don't need to keep all the full configs in memory + for (const batch of chunk([...hashesOfFilesToToUpsert.entries()], 1000)) { + // Get the full configs for the batch + const fullConfigs = await knexRaw< + Pick + >(trx, `select id, full from chart_configs where id in (?)`, [ + batch.map((entry) => entry[1].id), + ]) + const fullConfigMap = new Map( + fullConfigs.map(({ id, full }) => [id, full]) + ) + + // Upload the full configs to R2 in parallel + const uploadPromises = batch.map(async ([key, val]) => { + const id = val.id + const fullMd5 = val.fullMd5 + const full = fullConfigMap.get(id) + if (full === undefined) { + return Promise.reject( + new Error(`Full config not found for id ${id}`) + ) + } + const putObjectCommandInput: PutObjectCommandInput = { + Bucket: GRAPHER_CONFIG_R2_BUCKET, + Key: key, + Body: full, + ContentMD5: fullMd5, + ContentType: "application/json", + } + if (!dryRun) + await s3Client.send( + new PutObjectCommand(putObjectCommandInput), + { + requestTimeout: 10_000, // timeout of 10s + } + ) + else console.log("Would have upserted", key) + progressBar.tick() + return + }) + const promiseResults = await Promise.allSettled(uploadPromises) + const batchErrors = promiseResults + .filter((result) => result.status === "rejected") + .map((result) => result.reason) + errors.push(...batchErrors) + } + + console.log("Finished upserts") + if (errors.length > 0) { + console.error(`${errors.length} Errors during upserts`) + for (const error of errors) { + console.error(error) + } + } +} + +async function main(parsedArgs: parseArgs.ParsedArgs, dryRun: 
boolean) { + if ( + GRAPHER_CONFIG_R2_BUCKET === undefined || + GRAPHER_CONFIG_R2_BUCKET_PATH === undefined + ) { + console.info("R2 bucket not configured, exiting") + return + } + + const s3Client = new S3Client({ + endpoint: R2_ENDPOINT, + forcePathStyle: false, + region: R2_REGION, + credentials: { + accessKeyId: R2_ACCESS_KEY_ID, + secretAccessKey: R2_SECRET_ACCESS_KEY, + }, + }) + + const hashesOfFilesToToUpsertBySlug = new Map() + const hashesOfFilesToToUpsertByUuid = new Map() + const pathPrefixBySlug = excludeUndefined([ + GRAPHER_CONFIG_R2_BUCKET_PATH, + R2GrapherConfigDirectory.publishedGrapherBySlug, + ]).join("/") + + const pathPrefixByUuid = excludeUndefined([ + GRAPHER_CONFIG_R2_BUCKET_PATH, + R2GrapherConfigDirectory.byUUID, + ]).join("/") + + await knexReadonlyTransaction(async (trx) => { + console.log("Syncing published charts") + // Sync charts published by slug + const slugsAndHashesFromDb = await knexRaw< + Pick + >( + trx, + `select slug, fullMd5, id + from chart_configs + where slug is not null + and full ->> '$.isPublished' = "true"` + ) + console.log(`Found ${slugsAndHashesFromDb.length} published charts`) + + slugsAndHashesFromDb.forEach((row) => { + hashesOfFilesToToUpsertBySlug.set( + `${pathPrefixBySlug}/${row.slug}.json`, + { + fullMd5: row.fullMd5, + id: row.id, + } + ) + }) + + await syncWithR2( + s3Client, + pathPrefixBySlug, + hashesOfFilesToToUpsertBySlug, + trx, + dryRun + ) + + console.log("Syncing chart configs by UUID") + + // Sync charts by UUID + const slugsAndHashesFromDbByUuid = await knexRaw< + Pick + >(trx, `select fullMd5, id from chart_configs`) + + console.log(`Found ${slugsAndHashesFromDbByUuid.length} charts by UUID`) + + slugsAndHashesFromDbByUuid.forEach((row) => { + hashesOfFilesToToUpsertByUuid.set( + `${pathPrefixByUuid}/${row.id}.json`, + { + fullMd5: row.fullMd5, + id: row.id, + } + ) + }) + + await syncWithR2( + s3Client, + pathPrefixByUuid, + hashesOfFilesToToUpsertByUuid, + trx, + dryRun + ) + console.log("Finished syncing") + }) + process.exit(0) +} + +const parsedArgs = parseArgs(process.argv.slice(2)) +if (parsedArgs["h"]) { + console.log( + `syncGraphersToR2.js - sync grapher configs from the chart_configs table to R2 + +--dry-run: Don't make any actual changes to R2` + ) +} else { + main(parsedArgs, parsedArgs["dry-run"]) +} diff --git a/devTools/syncGraphersToR2/tsconfig.json b/devTools/syncGraphersToR2/tsconfig.json new file mode 100644 index 00000000000..74f2eaadbb6 --- /dev/null +++ b/devTools/syncGraphersToR2/tsconfig.json @@ -0,0 +1,18 @@ +{ + "extends": "../tsconfigs/tsconfig.base.json", + "compilerOptions": { + "outDir": "../../itsJustJavascript/devTools/syncGrapherToR2", + "rootDir": "." 
+ }, + "references": [ + { + "path": "../../db" + }, + { + "path": "../../adminSiteServer" + }, + { + "path": "../../settings" + } + ] +} diff --git a/package.json b/package.json index ce6401ed3a2..7ab8f15ca1b 100644 --- a/package.json +++ b/package.json @@ -39,7 +39,8 @@ "testPrettierAll": "yarn prettier --check \"**/*.{tsx,ts,jsx,js,json,md,html,css,scss,yml}\"", "testJest": "lerna run buildTests && jest", "testSiteNavigation": "tsx --tsconfig tsconfig.tsx.json devTools/navigationTest/navigationTest.ts", - "generateDbTypes": "npx @rmp135/sql-ts -c db/sql-ts/sql-ts-config.json" + "generateDbTypes": "npx @rmp135/sql-ts -c db/sql-ts/sql-ts-config.json", + "syncGraphersToR2": "tsx --tsconfig tsconfig.tsx.json devTools/syncGraphersToR2/syncGraphersToR2.ts" }, "dependencies": { "@algolia/autocomplete-js": "^1.17.2", diff --git a/packages/@ourworldindata/types/src/NominalType.ts b/packages/@ourworldindata/types/src/NominalType.ts index f3487f54232..f24497dfb29 100644 --- a/packages/@ourworldindata/types/src/NominalType.ts +++ b/packages/@ourworldindata/types/src/NominalType.ts @@ -20,3 +20,11 @@ declare const __nominal__type: unique symbol export type Nominal = Type & { readonly [__nominal__type]: Identifier } + +export function wrap(obj: T): Nominal { + return obj as Nominal +} + +export function unwrap(obj: Nominal): T { + return obj +} diff --git a/packages/@ourworldindata/types/src/dbTypes/ChartConfigs.ts b/packages/@ourworldindata/types/src/dbTypes/ChartConfigs.ts index 24c98ee6b6f..b1db3c1e82b 100644 --- a/packages/@ourworldindata/types/src/dbTypes/ChartConfigs.ts +++ b/packages/@ourworldindata/types/src/dbTypes/ChartConfigs.ts @@ -6,6 +6,7 @@ export interface DbInsertChartConfig { id: string patch: JsonString full: JsonString + fullMd5?: string slug?: string | null createdAt?: Date updatedAt?: Date | null diff --git a/packages/@ourworldindata/types/src/index.ts b/packages/@ourworldindata/types/src/index.ts index f75df1bca72..a88b15903b5 100644 --- a/packages/@ourworldindata/types/src/index.ts +++ b/packages/@ourworldindata/types/src/index.ts @@ -648,7 +648,7 @@ export { export { RedirectCode, type DbPlainRedirect } from "./dbTypes/Redirects.js" -export type { Nominal } from "./NominalType.js" +export { type Nominal, wrap, unwrap } from "./NominalType.js" export { type DbRawLatestWork, diff --git a/serverUtils/serverUtil.test.ts b/serverUtils/serverUtil.test.ts new file mode 100644 index 00000000000..50619b7852a --- /dev/null +++ b/serverUtils/serverUtil.test.ts @@ -0,0 +1,26 @@ +import { + base64ToBytes, + bytesToBase64, + bytesToHex, + hexToBytes, +} from "./serverUtil.js" +import crypto from "crypto" + +function generateRandomBytes(length: number): Uint8Array { + return crypto.randomBytes(length) +} + +describe("hex/base64 conversion is reversible", () => { + const originalBytes = generateRandomBytes(33) + const base64String = bytesToBase64(originalBytes) + const roundTrippedBytes = base64ToBytes(base64String) + it("is the same after converting to base64 and back", () => { + expect(originalBytes).toEqual(roundTrippedBytes) + }) + + const hexString = bytesToHex(originalBytes) + const roundTrippedBytesHex = hexToBytes(hexString) + it("is the same after converting to hex and back", () => { + expect(originalBytes).toEqual(roundTrippedBytesHex) + }) +}) diff --git a/serverUtils/serverUtil.tsx b/serverUtils/serverUtil.tsx index 812c11bb504..13df7aa9af5 100644 --- a/serverUtils/serverUtil.tsx +++ b/serverUtils/serverUtil.tsx @@ -1,6 +1,6 @@ import ReactDOMServer from "react-dom/server.js" 
import * as lodash from "lodash" -import { JsonError } from "@ourworldindata/utils" +import { JsonError, Nominal } from "@ourworldindata/utils" // Fail-fast integer conversion, for e.g. ids in url params export const expectInt = (value: any): number => { @@ -17,3 +17,22 @@ export const renderToHtmlPage = (element: any) => // Determine if input is suitable for use as a url slug export const isValidSlug = (slug: any) => lodash.isString(slug) && slug.length > 1 && slug.match(/^[\w-]+$/) + +export type Base64String = Nominal +export type HexString = Nominal + +export function base64ToBytes(base64: Base64String): Uint8Array { + return Buffer.from(base64, "base64") +} + +export function bytesToBase64(bytes: Uint8Array): Base64String { + return Buffer.from(bytes).toString("base64") as Base64String +} + +export function hexToBytes(hex: string): Uint8Array { + return Buffer.from(hex, "hex") +} + +export function bytesToHex(bytes: Uint8Array): HexString { + return Buffer.from(bytes).toString("hex") as HexString +} diff --git a/settings/serverSettings.ts b/settings/serverSettings.ts index 02771ed1b3c..d2e76d4da8e 100644 --- a/settings/serverSettings.ts +++ b/settings/serverSettings.ts @@ -154,22 +154,25 @@ export const IMAGE_HOSTING_R2_BUCKET_SUBFOLDER_PATH: string = IMAGE_HOSTING_R2_BUCKET_PATH.indexOf("/") + 1 ) // extract R2 credentials from rclone config as defaults -export const IMAGE_HOSTING_R2_ENDPOINT: string = - serverSettings.IMAGE_HOSTING_R2_ENDPOINT || +export const R2_ENDPOINT: string = + serverSettings.R2_ENDPOINT || rcloneConfig["owid-r2"]?.endpoint || "https://078fcdfed9955087315dd86792e71a7e.r2.cloudflarestorage.com" -export const IMAGE_HOSTING_R2_ACCESS_KEY_ID: string = - serverSettings.IMAGE_HOSTING_R2_ACCESS_KEY_ID || +export const R2_ACCESS_KEY_ID: string = + serverSettings.R2_ACCESS_KEY_ID || rcloneConfig["owid-r2"]?.access_key_id || "" -export const IMAGE_HOSTING_R2_SECRET_ACCESS_KEY: string = - serverSettings.IMAGE_HOSTING_R2_SECRET_ACCESS_KEY || +export const R2_SECRET_ACCESS_KEY: string = + serverSettings.R2_SECRET_ACCESS_KEY || rcloneConfig["owid-r2"]?.secret_access_key || "" -export const IMAGE_HOSTING_R2_REGION: string = - serverSettings.IMAGE_HOSTING_R2_REGION || - rcloneConfig["owid-r2"]?.region || - "auto" +export const R2_REGION: string = + serverSettings.R2_REGION || rcloneConfig["owid-r2"]?.region || "auto" + +export const GRAPHER_CONFIG_R2_BUCKET: string | undefined = + serverSettings.GRAPHER_CONFIG_R2_BUCKET +export const GRAPHER_CONFIG_R2_BUCKET_PATH: string | undefined = + serverSettings.GRAPHER_CONFIG_R2_BUCKET_PATH export const DATA_API_URL: string = clientSettings.DATA_API_URL diff --git a/site/gdocs/components/Image.tsx b/site/gdocs/components/Image.tsx index 57e58bbb967..3aceb05b773 100644 --- a/site/gdocs/components/Image.tsx +++ b/site/gdocs/components/Image.tsx @@ -115,9 +115,7 @@ export default function Image(props: { if (isPreviewing) { const makePreviewUrl = (f: string) => - `${IMAGE_HOSTING_R2_CDN_URL}/${IMAGE_HOSTING_R2_BUCKET_SUBFOLDER_PATH}/${encodeURIComponent( - f - )}` + `${IMAGE_HOSTING_R2_CDN_URL}/${IMAGE_HOSTING_R2_BUCKET_SUBFOLDER_PATH}/${encodeURIComponent(f)}` const PreviewSource = (props: { i?: ImageMetadata; sm?: boolean }) => { const { i, sm } = props diff --git a/tsconfig.json b/tsconfig.json index 94bbeed9aae..ae863d527a2 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -54,6 +54,9 @@ }, { "path": "./devTools/navigationTest" + }, + { + "path": "./devTools/syncGraphersToR2" } ] }