From 10112c8efbfac89c972889e22278f37fb7b9c0f4 Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Wed, 21 Aug 2024 11:02:38 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=94=A8=20rework=20md5=20hashing=20to=20ha?= =?UTF-8?q?ppen=20in=20the=20db=20since=20mysql=20rewrites=20json=20field?= =?UTF-8?q?=20order=20an=20whitespace=20and=20we=20need=20to=20hash=20in?= =?UTF-8?q?=20a=20consistent=20way?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- adminSiteServer/apiRouter.ts | 73 +++++++++++++------ adminSiteServer/chartConfigR2Helpers.ts | 13 ++-- .../1722415645057-AddChartConfigHash.ts | 2 +- devTools/syncGraphersToR2/syncGraphersToR2.ts | 3 +- serverUtils/serverUtil.tsx | 7 -- 5 files changed, 59 insertions(+), 39 deletions(-) diff --git a/adminSiteServer/apiRouter.ts b/adminSiteServer/apiRouter.ts index 3e8580f1e9f..d071f8f13b9 100644 --- a/adminSiteServer/apiRouter.ts +++ b/adminSiteServer/apiRouter.ts @@ -10,8 +10,8 @@ import { DATA_API_URL, } from "../settings/serverSettings.js" import { + Base64String, expectInt, - getMd5HashBase64, isValidSlug, } from "../serverUtils/serverUtil.js" import { @@ -326,23 +326,15 @@ const saveNewChart = async ( const fullConfig = mergeGrapherConfigs(fullParentConfig, patchConfig) const fullConfigStringified = JSON.stringify(fullConfig) - // compute an MD5 hash of the full config - const fullConfigMd5 = await getMd5HashBase64(fullConfigStringified) - // insert patch & full configs into the chart_configs table const chartConfigId = uuidv7() await db.knexRaw( knex, `-- sql - INSERT INTO chart_configs (id, patch, full, fullMd5) - VALUES (?, ?, ?, ?) + INSERT INTO chart_configs (id, patch, full) + VALUES (?, ?, ?) `, - [ - chartConfigId, - JSON.stringify(patchConfig), - fullConfigStringified, - fullConfigMd5, - ] + [chartConfigId, JSON.stringify(patchConfig), fullConfigStringified] ) // add a new chart to the charts table @@ -372,7 +364,23 @@ const saveNewChart = async ( [chartId, chartId, chartId] ) - await saveGrapherConfigToR2ByUUID(chartConfigId, fullConfigStringified) + // We need to get the full config and the md5 hash from the database instead of + // computing our own md5 hash because MySQL normalizes JSON and our + // client computed md5 would be different from the ones computed by and stored in R2 + const fullConfigMd5 = await db.knexRawFirst< + Pick + >( + knex, + `-- sql + select full, fullMd5 from chart_configs where id = ?`, + [chartConfigId] + ) + + await saveGrapherConfigToR2ByUUID( + chartConfigId, + fullConfigMd5!.full, + fullConfigMd5!.fullMd5 as Base64String + ) return { patchConfig, fullConfig } } @@ -410,8 +418,6 @@ const updateExistingChart = async ( const fullConfig = mergeGrapherConfigs(fullParentConfig, patchConfig) const fullConfigStringified = JSON.stringify(fullConfig) - const fullConfigMd5 = await getMd5HashBase64(fullConfigStringified) - const chartConfigId = await db.knexRawFirst>( knex, `SELECT configId FROM charts WHERE id = ?`, @@ -428,14 +434,12 @@ const updateExistingChart = async ( UPDATE chart_configs SET patch=?, - full=?, - fullMd5=? + full=? WHERE id = ? `, [ JSON.stringify(patchConfig), fullConfigStringified, - fullConfigMd5, chartConfigId.configId, ] ) @@ -451,9 +455,22 @@ const updateExistingChart = async ( [shouldInherit, new Date(), user.id, chartId] ) + // We need to get the full config and the md5 hash from the database instead of + // computing our own md5 hash because MySQL normalizes JSON and our + // client computed md5 would be different from the ones computed by and stored in R2 + const fullConfigMd5 = await db.knexRawFirst< + Pick + >( + knex, + `-- sql + select full, fullMd5 from chart_configs where id = ?`, + [chartConfigId] + ) + await saveGrapherConfigToR2ByUUID( chartConfigId.configId, - fullConfigStringified + fullConfigMd5!.full, + fullConfigMd5!.fullMd5 as Base64String ) return { patchConfig, fullConfig } @@ -634,13 +651,23 @@ const saveGrapher = async ( ) if (fullConfig.isPublished) { - const configStringified = JSON.stringify(fullConfig) - const configMd5 = await getMd5HashBase64(configStringified) + // We need to get the full config and the md5 hash from the database instead of + // computing our own md5 hash because MySQL normalizes JSON and our + // client computed md5 would be different from the ones computed by and stored in R2 + const fullConfigMd5 = await db.knexRawFirst< + Pick + >( + knex, + `-- sql + select full, fullMd5 from chart_configs where id = ?`, + [] + ) + await saveGrapherConfigToR2( - configStringified, + fullConfigMd5!.full, R2GrapherConfigDirectory.publishedGrapherBySlug, `${fullConfig.slug}.json`, - configMd5 + fullConfigMd5!.fullMd5 as Base64String ) } diff --git a/adminSiteServer/chartConfigR2Helpers.ts b/adminSiteServer/chartConfigR2Helpers.ts index 8768eac6382..b55a862d999 100644 --- a/adminSiteServer/chartConfigR2Helpers.ts +++ b/adminSiteServer/chartConfigR2Helpers.ts @@ -15,7 +15,7 @@ import { } from "@aws-sdk/client-s3" import { JsonError, lazy } from "@ourworldindata/utils" import { logErrorAndMaybeSendToBugsnag } from "../serverUtils/errorLog.js" -import { getMd5HashBase64, Base64String } from "../serverUtils/serverUtil.js" +import { Base64String } from "../serverUtils/serverUtil.js" export enum R2GrapherConfigDirectory { byUUID = "config/by-uuid", @@ -37,15 +37,14 @@ const getS3Client: () => S3Client = lazy( export async function saveGrapherConfigToR2ByUUID( id: string, - chartConfigStringified: string + chartConfigStringified: string, + configMd5FromDb: Base64String ) { - const configMd5 = await getMd5HashBase64(chartConfigStringified) - await saveGrapherConfigToR2( chartConfigStringified, R2GrapherConfigDirectory.byUUID, `${id}.json`, - configMd5 + configMd5FromDb ) } @@ -60,7 +59,7 @@ export async function saveGrapherConfigToR2( config_stringified: string, directory: R2GrapherConfigDirectory, filename: string, - configMd5: Base64String + configMd5FromDb: Base64String ) { if (process.env.NODE_ENV === "test") { console.log("Skipping saving grapher config to R2 in test environment") @@ -94,7 +93,7 @@ export async function saveGrapherConfigToR2( Key: path, Body: config_stringified, ContentType: MIMEType, - ContentMD5: configMd5, + ContentMD5: configMd5FromDb, } await s3Client.send(new PutObjectCommand(params)) diff --git a/db/migration/1722415645057-AddChartConfigHash.ts b/db/migration/1722415645057-AddChartConfigHash.ts index 287ab8b2d8d..e6dcb5acfc7 100644 --- a/db/migration/1722415645057-AddChartConfigHash.ts +++ b/db/migration/1722415645057-AddChartConfigHash.ts @@ -4,7 +4,7 @@ export class AddChartConfigHash1722415645057 implements MigrationInterface { public async up(queryRunner: QueryRunner): Promise { await queryRunner.query(` ALTER TABLE chart_configs - ADD COLUMN fullMd5 CHAR(24) DEFAULT to_base64(unhex(md5(full))); + ADD COLUMN fullMd5 CHAR(24) GENERATED ALWAYS as (to_base64(unhex(md5(full)))) STORED NOT NULL; `) } diff --git a/devTools/syncGraphersToR2/syncGraphersToR2.ts b/devTools/syncGraphersToR2/syncGraphersToR2.ts index 486b64fd1e4..fb3e8cae1cb 100644 --- a/devTools/syncGraphersToR2/syncGraphersToR2.ts +++ b/devTools/syncGraphersToR2/syncGraphersToR2.ts @@ -90,7 +90,7 @@ async function syncWithR2( // Usage: await listS3ObjectsAndPerformAction(s3Client, pathPrefix, (object) => { if (object && object.Key && object.ETag) { - const md5 = object.ETag.replace(/"/g, "") as HexString + const md5 = object.ETag.replace(/(W\/)?"/g, "") as HexString const md5Base64 = bytesToBase64(hexToBytes(md5)) if (hashesOfFilesToToUpsert.has(object.Key)) { @@ -290,6 +290,7 @@ async function main(parsedArgs: parseArgs.ParsedArgs, dryRun: boolean) { dryRun ) }) + process.exit(0) } const parsedArgs = parseArgs(process.argv.slice(2)) diff --git a/serverUtils/serverUtil.tsx b/serverUtils/serverUtil.tsx index f9a3fa32249..13df7aa9af5 100644 --- a/serverUtils/serverUtil.tsx +++ b/serverUtils/serverUtil.tsx @@ -1,7 +1,6 @@ import ReactDOMServer from "react-dom/server.js" import * as lodash from "lodash" import { JsonError, Nominal } from "@ourworldindata/utils" -import { createHash } from "crypto" // Fail-fast integer conversion, for e.g. ids in url params export const expectInt = (value: any): number => { @@ -37,9 +36,3 @@ export function hexToBytes(hex: string): Uint8Array { export function bytesToHex(bytes: Uint8Array): HexString { return Buffer.from(bytes).toString("hex") as HexString } - -export function getMd5HashBase64(data: string): Base64String { - return createHash("md5") - .update(data, "utf-8") - .digest("base64") as Base64String -}