Skip to content

Commit

Permalink
🔨 rework md5 hashing to happen in the db since mysql
Browse files Browse the repository at this point in the history
rewrites json field order and whitespace and we need
to hash in a consistent way
  • Loading branch information
danyx23 committed Aug 23, 2024
1 parent b6b2299 commit bf13412
Showing 5 changed files with 59 additions and 39 deletions.
73 changes: 50 additions & 23 deletions adminSiteServer/apiRouter.ts
Original file line number Diff line number Diff line change
@@ -10,8 +10,8 @@ import {
DATA_API_URL,
} from "../settings/serverSettings.js"
import {
Base64String,
expectInt,
getMd5HashBase64,
isValidSlug,
} from "../serverUtils/serverUtil.js"
import {
@@ -303,23 +303,15 @@ const saveNewChart = async (
const fullConfig = mergeGrapherConfigs(parentConfig, patchConfig)
const fullConfigStringified = JSON.stringify(fullConfig)

// compute an MD5 hash of the full config
const fullConfigMd5 = await getMd5HashBase64(fullConfigStringified)

// insert patch & full configs into the chart_configs table
const chartConfigId = uuidv7()
await db.knexRaw(
knex,
`-- sql
INSERT INTO chart_configs (id, patch, full, fullMd5)
VALUES (?, ?, ?, ?)
INSERT INTO chart_configs (id, patch, full)
VALUES (?, ?, ?)
`,
[
chartConfigId,
JSON.stringify(patchConfig),
fullConfigStringified,
fullConfigMd5,
]
[chartConfigId, JSON.stringify(patchConfig), fullConfigStringified]
)

// add a new chart to the charts table
@@ -349,7 +341,23 @@ const saveNewChart = async (
[chartId, chartId, chartId]
)

await saveGrapherConfigToR2ByUUID(chartConfigId, fullConfigStringified)
// We need to read the full config and its md5 hash back from the database instead of
// computing the md5 hash ourselves: MySQL normalizes JSON when storing it, so a
// client-computed md5 would differ from the hash of the stored config that is uploaded to R2
const fullConfigMd5 = await db.knexRawFirst<
Pick<DbRawChartConfig, "full" | "fullMd5">
>(
knex,
`-- sql
select full, fullMd5 from chart_configs where id = ?`,
[chartConfigId]
)

await saveGrapherConfigToR2ByUUID(
chartConfigId,
fullConfigMd5!.full,
fullConfigMd5!.fullMd5 as Base64String
)

return { patchConfig, fullConfig }
}
@@ -381,8 +389,6 @@ const updateExistingChart = async (
const fullConfig = mergeGrapherConfigs(parentConfig, patchConfig)
const fullConfigStringified = JSON.stringify(fullConfig)

const fullConfigMd5 = await getMd5HashBase64(fullConfigStringified)

const chartConfigId = await db.knexRawFirst<Pick<DbPlainChart, "configId">>(
knex,
`SELECT configId FROM charts WHERE id = ?`,
@@ -399,14 +405,12 @@ const updateExistingChart = async (
UPDATE chart_configs
SET
patch=?,
full=?,
fullMd5=?
full=?
WHERE id = ?
`,
[
JSON.stringify(patchConfig),
fullConfigStringified,
fullConfigMd5,
chartConfigId.configId,
]
)
@@ -422,9 +426,22 @@ const updateExistingChart = async (
[new Date(), user.id, chartId]
)

// We need to read the full config and its md5 hash back from the database instead of
// computing the md5 hash ourselves: MySQL normalizes JSON when storing it, so a
// client-computed md5 would differ from the hash of the stored config that is uploaded to R2
const fullConfigMd5 = await db.knexRawFirst<
Pick<DbRawChartConfig, "full" | "fullMd5">
>(
knex,
`-- sql
select full, fullMd5 from chart_configs where id = ?`,
[chartConfigId]
)

await saveGrapherConfigToR2ByUUID(
chartConfigId.configId,
fullConfigStringified
fullConfigMd5!.full,
fullConfigMd5!.fullMd5 as Base64String
)

return { patchConfig, fullConfig }
@@ -581,13 +598,23 @@ const saveGrapher = async (
)

if (fullConfig.isPublished) {
const configStringified = JSON.stringify(fullConfig)
const configMd5 = await getMd5HashBase64(configStringified)
// We need to read the full config and its md5 hash back from the database instead of
// computing the md5 hash ourselves: MySQL normalizes JSON when storing it, so a
// client-computed md5 would differ from the hash of the stored config that is uploaded to R2
const fullConfigMd5 = await db.knexRawFirst<
Pick<DbRawChartConfig, "full" | "fullMd5">
>(
knex,
`-- sql
select full, fullMd5 from chart_configs where id = ?`,
[]
)

await saveGrapherConfigToR2(
configStringified,
fullConfigMd5!.full,
R2GrapherConfigDirectory.publishedGrapherBySlug,
`${fullConfig.slug}.json`,
configMd5
fullConfigMd5!.fullMd5 as Base64String
)
}

13 changes: 6 additions & 7 deletions adminSiteServer/chartConfigR2Helpers.ts
Original file line number Diff line number Diff line change
@@ -15,7 +15,7 @@ import {
} from "@aws-sdk/client-s3"
import { JsonError, lazy } from "@ourworldindata/utils"
import { logErrorAndMaybeSendToBugsnag } from "../serverUtils/errorLog.js"
import { getMd5HashBase64, Base64String } from "../serverUtils/serverUtil.js"
import { Base64String } from "../serverUtils/serverUtil.js"

export enum R2GrapherConfigDirectory {
byUUID = "config/by-uuid",
@@ -37,15 +37,14 @@ const getS3Client: () => S3Client = lazy(

export async function saveGrapherConfigToR2ByUUID(
id: string,
chartConfigStringified: string
chartConfigStringified: string,
configMd5FromDb: Base64String
) {
const configMd5 = await getMd5HashBase64(chartConfigStringified)

await saveGrapherConfigToR2(
chartConfigStringified,
R2GrapherConfigDirectory.byUUID,
`${id}.json`,
configMd5
configMd5FromDb
)
}

@@ -60,7 +59,7 @@ export async function saveGrapherConfigToR2(
config_stringified: string,
directory: R2GrapherConfigDirectory,
filename: string,
configMd5: Base64String
configMd5FromDb: Base64String
) {
if (process.env.NODE_ENV === "test") {
console.log("Skipping saving grapher config to R2 in test environment")
@@ -94,7 +93,7 @@ export async function saveGrapherConfigToR2(
Key: path,
Body: config_stringified,
ContentType: MIMEType,
ContentMD5: configMd5,
ContentMD5: configMd5FromDb,
}

await s3Client.send(new PutObjectCommand(params))
2 changes: 1 addition & 1 deletion db/migration/1722415645057-AddChartConfigHash.ts
Original file line number Diff line number Diff line change
@@ -4,7 +4,7 @@ export class AddChartConfigHash1722415645057 implements MigrationInterface {
public async up(queryRunner: QueryRunner): Promise<void> {
await queryRunner.query(`
ALTER TABLE chart_configs
ADD COLUMN fullMd5 CHAR(24) DEFAULT to_base64(unhex(md5(full)));
ADD COLUMN fullMd5 CHAR(24) GENERATED ALWAYS as (to_base64(unhex(md5(full)))) STORED NOT NULL;
`)
}

3 changes: 2 additions & 1 deletion devTools/syncGraphersToR2/syncGraphersToR2.ts
Original file line number Diff line number Diff line change
@@ -90,7 +90,7 @@ async function syncWithR2(
// Usage:
await listS3ObjectsAndPerformAction(s3Client, pathPrefix, (object) => {
if (object && object.Key && object.ETag) {
const md5 = object.ETag.replace(/"/g, "") as HexString
const md5 = object.ETag.replace(/(W\/)?"/g, "") as HexString
const md5Base64 = bytesToBase64(hexToBytes(md5))

if (hashesOfFilesToToUpsert.has(object.Key)) {
@@ -290,6 +290,7 @@ async function main(parsedArgs: parseArgs.ParsedArgs, dryRun: boolean) {
dryRun
)
})
process.exit(0)
}

const parsedArgs = parseArgs(process.argv.slice(2))
7 changes: 0 additions & 7 deletions serverUtils/serverUtil.tsx
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import ReactDOMServer from "react-dom/server.js"
import * as lodash from "lodash"
import { JsonError, Nominal } from "@ourworldindata/utils"
import { createHash } from "crypto"

// Fail-fast integer conversion, for e.g. ids in url params
export const expectInt = (value: any): number => {
@@ -37,9 +36,3 @@ export function hexToBytes(hex: string): Uint8Array {
/** Encodes a byte array as a hexadecimal string. */
export function bytesToHex(bytes: Uint8Array): HexString {
    const hexEncoded = Buffer.from(bytes).toString("hex")
    return hexEncoded as HexString
}

/**
 * Computes the MD5 digest of a string and returns it base64-encoded.
 *
 * The input is hashed as UTF-8 bytes.
 */
export function getMd5HashBase64(data: string): Base64String {
    const hasher = createHash("md5")
    hasher.update(data, "utf-8")
    const digest = hasher.digest("base64")
    return digest as Base64String
}

0 comments on commit bf13412

Please sign in to comment.