From aeb9e71ee778850542dedd9340149dc61d9107d5 Mon Sep 17 00:00:00 2001 From: Rico Kahler Date: Tue, 1 Oct 2024 16:34:03 -0500 Subject: [PATCH] test: add efps to CI (#7556) * test: revamp eFPS suite * chore: add efps github actions * chore: use different token for efps tests * test: add comment with perf report result * test: keep playwright install script * feat: add workflow dispatch inputs * fix: use `requestAnimationFrame` instead --------- Co-authored-by: Binoy Patel --- .github/workflows/efps.yml | 95 +++++ package.json | 1 + ...atePercentile.ts => aggregateLatencies.ts} | 13 +- perf/efps/helpers/measureBlockingTime.ts | 47 +++ perf/efps/helpers/measureFpsForInput.ts | 41 +- perf/efps/helpers/measureFpsForPte.ts | 38 +- perf/efps/index.ts | 380 ++++++++++++++++-- perf/efps/runTest.ts | 70 ++-- perf/efps/tests/article/article.ts | 32 +- perf/efps/tests/recipe/recipe.ts | 23 +- perf/efps/tests/synthetic/synthetic.ts | 24 +- perf/efps/types.ts | 21 +- 12 files changed, 624 insertions(+), 161 deletions(-) create mode 100644 .github/workflows/efps.yml rename perf/efps/helpers/{calculatePercentile.ts => aggregateLatencies.ts} (61%) create mode 100644 perf/efps/helpers/measureBlockingTime.ts diff --git a/.github/workflows/efps.yml b/.github/workflows/efps.yml new file mode 100644 index 00000000000..d46e9c6d557 --- /dev/null +++ b/.github/workflows/efps.yml @@ -0,0 +1,95 @@ +name: eFPS Test +on: + pull_request: + workflow_dispatch: + inputs: + reference_tag: + description: "Reference tag for comparison" + required: true + default: "latest" + enable_profiler: + description: "Enable profiler" + required: true + type: boolean + default: false + +jobs: + install: + timeout-minutes: 30 + runs-on: ubuntu-latest + env: + TURBO_TOKEN: ${{ secrets.TURBO_TOKEN }} + TURBO_TEAM: ${{ vars.TURBO_TEAM }} + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version: 18 + + - uses: pnpm/action-setup@v4 + name: Install pnpm + id: pnpm-install + with: + run_install: false + + - name: Get pnpm store directory + id: pnpm-cache + shell: bash + run: | + echo "STORE_PATH=$(pnpm store path)" >> $GITHUB_OUTPUT + + - name: Cache node modules + id: cache-node-modules + uses: actions/cache@v4 + env: + cache-name: cache-node-modules + with: + path: ${{ steps.pnpm-cache.outputs.STORE_PATH }} + key: ${{ runner.os }}-pnpm-store-${{ env.cache-name }}-${{ hashFiles('**/pnpm-lock.yaml') }} + restore-keys: | + v1-${{ runner.os }}-pnpm-store-${{ env.cache-name }}- + v1-${{ runner.os }}-pnpm-store- + v1-${{ runner.os }}- + + - name: Install project dependencies + run: pnpm install + + - name: Store Playwright's Version + run: | + PLAYWRIGHT_VERSION=$(npx playwright --version | sed 's/Version //') + echo "Playwright's Version: $PLAYWRIGHT_VERSION" + echo "PLAYWRIGHT_VERSION=$PLAYWRIGHT_VERSION" >> $GITHUB_ENV + + - name: Cache Playwright Browsers for Playwright's Version + id: cache-playwright-browsers + uses: actions/cache@v4 + with: + path: ~/.cache/ms-playwright + key: playwright-browsers-${{ env.PLAYWRIGHT_VERSION }} + + - name: Install Playwright Browsers + if: steps.cache-playwright-browsers.outputs.cache-hit != 'true' + run: npx playwright install --with-deps + + - name: Run eFPS tests + env: + VITE_PERF_EFPS_PROJECT_ID: ${{ secrets.PERF_EFPS_PROJECT_ID }} + VITE_PERF_EFPS_DATASET: ${{ secrets.PERF_EFPS_DATASET }} + PERF_EFPS_SANITY_TOKEN: ${{ secrets.PERF_EFPS_SANITY_TOKEN }} + REFERENCE_TAG: ${{ github.event.inputs.reference_tag || 'latest' }} + ENABLE_PROFILER: ${{ 
github.event.inputs.enable_profiler || false }} + run: pnpm efps:test + + - name: PR comment with report + uses: thollander/actions-comment-pull-request@fabd468d3a1a0b97feee5f6b9e499eab0dd903f6 # v2 + if: ${{ github.event_name == 'pull_request' }} + with: + comment_tag: "efps-report" + filePath: ${{ github.workspace }}/perf/efps/results/benchmark-results.md + + - uses: actions/upload-artifact@v3 + if: always() + with: + name: efps-report + path: perf/efps/results + retention-days: 30 diff --git a/package.json b/package.json index e133a523e10..fe495176c90 100644 --- a/package.json +++ b/package.json @@ -54,6 +54,7 @@ "e2e:setup": "node -r dotenv-flow/config -r esbuild-register scripts/e2e/setup", "e2e:start": "pnpm --filter studio-e2e-testing preview", "etl": "node -r dotenv-flow/config -r esbuild-register scripts/etl", + "efps:test": "cd perf/efps && pnpm test", "example:blog-studio": "cd examples/blog-studio && pnpm start", "example:clean-studio": "cd examples/blog-studio && pnpm start", "example:ecommerce-studio": "cd examples/blog-studio && pnpm start", diff --git a/perf/efps/helpers/calculatePercentile.ts b/perf/efps/helpers/aggregateLatencies.ts similarity index 61% rename from perf/efps/helpers/calculatePercentile.ts rename to perf/efps/helpers/aggregateLatencies.ts index 54acbded118..e3f82202a77 100644 --- a/perf/efps/helpers/calculatePercentile.ts +++ b/perf/efps/helpers/aggregateLatencies.ts @@ -1,4 +1,6 @@ -export function calculatePercentile(numbers: number[], percentile: number): number { +import {type EfpsResult} from '../types' + +function calculatePercentile(numbers: number[], percentile: number): number { // Sort the array in ascending order const sorted = numbers.slice().sort((a, b) => a - b) @@ -19,3 +21,12 @@ export function calculatePercentile(numbers: number[], percentile: number): numb const fraction = index - lowerIndex return lowerValue + (upperValue - lowerValue) * fraction } + +export function aggregateLatencies(values: number[]): EfpsResult['latency'] { + return { + p50: calculatePercentile(values, 0.5), + p75: calculatePercentile(values, 0.75), + p90: calculatePercentile(values, 0.9), + p99: calculatePercentile(values, 0.99), + } +} diff --git a/perf/efps/helpers/measureBlockingTime.ts b/perf/efps/helpers/measureBlockingTime.ts new file mode 100644 index 00000000000..4208c69738f --- /dev/null +++ b/perf/efps/helpers/measureBlockingTime.ts @@ -0,0 +1,47 @@ +import {type Page} from 'playwright' + +const BLOCKING_TASK_THRESHOLD = 50 + +export function measureBlockingTime(page: Page): () => Promise { + const idleGapLengthsPromise = page.evaluate(async () => { + const idleGapLengths: number[] = [] + const done = false + let last = performance.now() + + const handler = () => { + const current = performance.now() + idleGapLengths.push(current - last) + last = current + + if (done) return + requestAnimationFrame(handler) + } + + requestAnimationFrame(handler) + + await new Promise((resolve) => { + document.addEventListener('__blockingTimeFinish', resolve, {once: true}) + }) + + return idleGapLengths + }) + + async function getBlockingTime() { + await page.evaluate(() => { + document.dispatchEvent(new CustomEvent('__blockingTimeFinish')) + }) + + const idleGapLengths = await idleGapLengthsPromise + + const blockingTime = idleGapLengths + // only consider the gap lengths that are blocking + .filter((idleGapLength) => idleGapLength > BLOCKING_TASK_THRESHOLD) + // subtract the allowed time so we're only left with blocking time + .map((idleGapLength) => idleGapLength - 
BLOCKING_TASK_THRESHOLD) + .reduce((sum, next) => sum + next, 0) + + return blockingTime + } + + return getBlockingTime +} diff --git a/perf/efps/helpers/measureFpsForInput.ts b/perf/efps/helpers/measureFpsForInput.ts index 4b0c584c795..22310a92fb2 100644 --- a/perf/efps/helpers/measureFpsForInput.ts +++ b/perf/efps/helpers/measureFpsForInput.ts @@ -1,9 +1,28 @@ -import {type Locator} from 'playwright' +import {type Page} from 'playwright' import {type EfpsResult} from '../types' -import {calculatePercentile} from './calculatePercentile' +import {aggregateLatencies} from './aggregateLatencies' +import {measureBlockingTime} from './measureBlockingTime' -export async function measureFpsForInput(input: Locator): Promise { +interface MeasureFpsForInputOptions { + label?: string + page: Page + fieldName: string +} + +export async function measureFpsForInput({ + label, + fieldName, + page, +}: MeasureFpsForInputOptions): Promise { + const start = Date.now() + + const input = page + .locator( + `[data-testid="field-${fieldName}"] input[type="text"], ` + + `[data-testid="field-${fieldName}"] textarea`, + ) + .first() await input.waitFor({state: 'visible'}) const characters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' @@ -49,6 +68,8 @@ export async function measureFpsForInput(input: Locator): Promise { await input.pressSequentially(startingMarker) await new Promise((resolve) => setTimeout(resolve, 500)) + const getBlockingTime = measureBlockingTime(page) + for (const character of characters) { inputEvents.push({character, timestamp: Date.now()}) await input.press(character) @@ -57,6 +78,9 @@ export async function measureFpsForInput(input: Locator): Promise { await input.blur() + await page.evaluate(() => window.document.dispatchEvent(new CustomEvent('__finish'))) + + const blockingTime = await getBlockingTime() const renderEvents = await rendersPromise await new Promise((resolve) => setTimeout(resolve, 500)) @@ -74,9 +98,10 @@ export async function measureFpsForInput(input: Locator): Promise { return matchingEvent.timestamp - inputEvent.timestamp }) - const p50 = 1000 / calculatePercentile(latencies, 0.5) - const p75 = 1000 / calculatePercentile(latencies, 0.75) - const p90 = 1000 / calculatePercentile(latencies, 0.9) - - return {p50, p75, p90, latencies} + return { + latency: aggregateLatencies(latencies), + blockingTime, + label: label || fieldName, + runDuration: Date.now() - start, + } } diff --git a/perf/efps/helpers/measureFpsForPte.ts b/perf/efps/helpers/measureFpsForPte.ts index ffa233e9064..06f89786c1b 100644 --- a/perf/efps/helpers/measureFpsForPte.ts +++ b/perf/efps/helpers/measureFpsForPte.ts @@ -1,9 +1,22 @@ -import {type Locator} from 'playwright' +import {type Page} from 'playwright' import {type EfpsResult} from '../types' -import {calculatePercentile} from './calculatePercentile' +import {aggregateLatencies} from './aggregateLatencies' +import {measureBlockingTime} from './measureBlockingTime' -export async function measureFpsForPte(pteField: Locator): Promise { +interface MeasureFpsForPteOptions { + fieldName: string + label?: string + page: Page +} + +export async function measureFpsForPte({ + fieldName, + page, + label, +}: MeasureFpsForPteOptions): Promise { + const start = Date.now() + const pteField = page.locator(`[data-testid="field-${fieldName}"]`) const characters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' await pteField.waitFor({state: 'visible'}) @@ -24,14 +37,14 @@ export async function measureFpsForPte(pteField: Locator): Promise { }[] = [] 
const mutationObserver = new MutationObserver(() => { - const start = performance.now() + const textStart = performance.now() const textContent = el.textContent || '' - const end = performance.now() + const textEnd = performance.now() updates.push({ value: textContent, timestamp: Date.now(), - textContentProcessingTime: end - start, + textContentProcessingTime: textEnd - textStart, }) }) @@ -63,6 +76,7 @@ export async function measureFpsForPte(pteField: Locator): Promise { await contentEditable.pressSequentially(startingMarker) await new Promise((resolve) => setTimeout(resolve, 500)) + const getBlockingTime = measureBlockingTime(page) for (const character of characters) { inputEvents.push({character, timestamp: Date.now()}) await contentEditable.press(character) @@ -71,6 +85,7 @@ export async function measureFpsForPte(pteField: Locator): Promise { await contentEditable.blur() + const blockingTime = await getBlockingTime() const renderEvents = await rendersPromise const latencies = inputEvents.map((inputEvent) => { @@ -86,9 +101,10 @@ export async function measureFpsForPte(pteField: Locator): Promise { return matchingEvent.timestamp - inputEvent.timestamp - matchingEvent.textContentProcessingTime }) - const p50 = 1000 / calculatePercentile(latencies, 0.5) - const p75 = 1000 / calculatePercentile(latencies, 0.75) - const p90 = 1000 / calculatePercentile(latencies, 0.9) - - return {p50, p75, p90, latencies} + return { + latency: aggregateLatencies(latencies), + blockingTime, + label: label || fieldName, + runDuration: Date.now() - start, + } } diff --git a/perf/efps/index.ts b/perf/efps/index.ts index c7cb95e7703..6e7e864e962 100644 --- a/perf/efps/index.ts +++ b/perf/efps/index.ts @@ -2,6 +2,8 @@ // eslint-disable-next-line import/no-unassigned-import import 'dotenv/config' +import fs from 'node:fs' +import os from 'node:os' import path from 'node:path' import process from 'node:process' import {fileURLToPath} from 'node:url' @@ -11,16 +13,22 @@ import chalk from 'chalk' import Table from 'cli-table3' import Ora from 'ora' -// eslint-disable-next-line import/no-extraneous-dependencies import {exec} from './helpers/exec' import {runTest} from './runTest' import article from './tests/article/article' import recipe from './tests/recipe/recipe' -import singleString from './tests/singleString/singleString' import synthetic from './tests/synthetic/synthetic' +import {type EfpsAbResult, type EfpsResult, type EfpsTest} from './types' -const headless = true -const tests = [singleString, recipe, article, synthetic] +const WARNING_THRESHOLD = 0.2 +const TEST_ATTEMPTS = process.env.CI ? 3 : 1 + +const HEADLESS = true +// eslint-disable-next-line turbo/no-undeclared-env-vars +const ENABLE_PROFILER = process.env.ENABLE_PROFILER === 'true' +// eslint-disable-next-line turbo/no-undeclared-env-vars +const REFERENCE_TAG = process.env.REFERENCE_TAG || 'latest' +const TESTS = [article, recipe, synthetic] const projectId = process.env.VITE_PERF_EFPS_PROJECT_ID! const dataset = process.env.VITE_PERF_EFPS_DATASET! 
@@ -52,9 +60,58 @@ const resultsDir = path.join( .toLowerCase()}`, ) +const getSanityPkgPathForTag = async (tag: string) => { + const tmpDir = path.join(os.tmpdir(), `sanity-${tag}`) + + try { + await fs.promises.rm(tmpDir, {recursive: true}) + } catch { + // intentionally blank + } + await fs.promises.mkdir(tmpDir, {recursive: true}) + + await exec({ + command: `npm install sanity@${tag}`, + cwd: tmpDir, + spinner, + text: [`Downloading sanity@${tag} package…`, `Downloaded sanity@${tag}`], + }) + + return path.join(tmpDir, 'node_modules', 'sanity') +} + +const formatEfps = (latencyMs: number) => { + const efps = 1000 / latencyMs + const rounded = efps.toFixed(1) + + if (efps >= 100) return chalk.green('99.9+') + if (efps >= 60) return chalk.green(rounded) + if (efps >= 20) return chalk.yellow(rounded) + return chalk.red(rounded) +} + +const formatPercentageChange = (experiment: number, reference: number): string => { + if (experiment < 16 && reference < 16) return '-/-%' + const delta = (experiment - reference) / reference + if (!delta) return '-/-%' + const percentage = delta * 100 + const rounded = percentage.toFixed(1) + const sign = delta >= 0 ? '+' : '' + return `${sign}${rounded}%` +} + +// For markdown formatting without colors +const formatEfpsPlain = (latencyMs: number) => { + const efps = 1000 / latencyMs + const rounded = efps.toFixed(1) + + if (efps >= 100) return '99.9+' + return rounded +} + const spinner = Ora() -spinner.info(`Running ${tests.length} tests: ${tests.map((t) => `'${t.name}'`).join(', ')}`) +spinner.info(`Running ${TESTS.length} tests: ${TESTS.map((t) => `'${t.name}'`).join(', ')}`) await exec({ text: ['Building the monorepo…', 'Built monorepo'], @@ -69,47 +126,290 @@ await exec({ spinner, }) -const table = new Table({ - head: [chalk.bold('benchmark'), 'eFPS p50', 'eFPS p75', 'eFPS p90'].map((cell) => - chalk.cyan(cell), - ), -}) +const localSanityPkgPath = path.dirname(fileURLToPath(import.meta.resolve('sanity/package.json'))) -const formatFps = (fps: number) => { - const rounded = fps.toFixed(1) - if (fps >= 60) return chalk.green(rounded) - if (fps < 20) return chalk.red(rounded) - return chalk.yellow(rounded) +const referenceSanityPkgPath = await getSanityPkgPathForTag(REFERENCE_TAG) +const experimentSanityPkgPath = localSanityPkgPath + +function mergeResults(baseResults: EfpsResult[] | undefined, incomingResults: EfpsResult[]) { + if (!baseResults) return incomingResults + + return incomingResults.map((incomingResult, index) => { + const baseResult = baseResults[index] + + const incomingMedianLatency = incomingResult.latency.p50 + const baseMedianLatency = baseResult.latency.p50 + + // if the incoming test run performed better, we'll take that one + if (incomingMedianLatency < baseMedianLatency) return incomingResult + // otherwise, use the previous run + return baseResult + }) } -for (let i = 0; i < tests.length; i++) { - const test = tests[i] - const results = await runTest({ - prefix: `Running '${test.name}' [${i + 1}/${tests.length}]…`, - test, - resultsDir, - spinner, - client, - headless, - projectId, +const testResults: Array<{ + name: string + results: EfpsAbResult[] +}> = [] + +async function runAbTest(test: EfpsTest) { + let referenceResults: EfpsResult[] | undefined + let experimentResults: EfpsResult[] | undefined + + for (let attempt = 0; attempt < TEST_ATTEMPTS; attempt++) { + const attemptMessage = TEST_ATTEMPTS > 1 ? 
` [${attempt + 1}/${TEST_ATTEMPTS}]` : '' + const referenceMessage = `Running test '${test.name}' on \`sanity@${REFERENCE_TAG}\`${attemptMessage}` + spinner.start(referenceMessage) + + referenceResults = mergeResults( + referenceResults, + await runTest({ + key: 'reference', + test, + resultsDir, + client, + headless: HEADLESS, + enableProfiler: ENABLE_PROFILER, + projectId, + sanityPkgPath: referenceSanityPkgPath, + log: (message) => { + spinner.text = `${referenceMessage}: ${message}` + }, + }), + ) + spinner.succeed(`Ran test '${test.name}' on \`sanity@${REFERENCE_TAG}\`${attemptMessage}`) + + const experimentMessage = `Running test '${test.name}' on this branch${attemptMessage}` + spinner.start(experimentMessage) + experimentResults = mergeResults( + experimentResults, + await runTest({ + key: 'experiment', + test, + resultsDir, + client, + headless: HEADLESS, + enableProfiler: ENABLE_PROFILER, + projectId, + sanityPkgPath: experimentSanityPkgPath, + log: (message) => { + spinner.text = `${experimentMessage}: ${message}` + }, + }), + ) + spinner.succeed(`Ran test '${test.name}' on this branch${attemptMessage}`) + } + + return experimentResults!.map( + (experimentResult, index): EfpsAbResult => ({ + experiment: experimentResult, + reference: referenceResults![index], + }), + ) +} + +for (let i = 0; i < TESTS.length; i++) { + const test = TESTS[i] + testResults.push({ + name: test.name, + results: await runAbTest(test), }) +} + +const comparisonTableCli = new Table({ + head: ['Benchmark', 'reference', 'experiment', 'Δ (%)', ''].map((cell) => chalk.cyan(cell)), +}) + +const detailedInformationCliHead = [ + 'Benchmark', + 'latency', + 'p75', + 'p90', + 'p99', + 'blocking time', + 'test duration', +].map((i) => chalk.cyan(i)) - for (const result of results) { - table.push({ - [[chalk.bold(test.name), result.label ? `(${result.label})` : ''].join(' ')]: [ - formatFps(result.p50), - formatFps(result.p75), - formatFps(result.p90), - ], - }) +const referenceTableCli = new Table({head: detailedInformationCliHead}) +const experimentTableCli = new Table({head: detailedInformationCliHead}) + +function isSignificantlyDifferent(experiment: number, reference: number) { + // values are too small to and are already performing well + if (experiment < 16 && reference < 16) return false + const delta = (experiment - reference) / reference + return delta >= WARNING_THRESHOLD +} + +for (const {name, results} of testResults) { + for (const {experiment, reference} of results) { + const significantlyDifferent = isSignificantlyDifferent( + experiment.latency.p50, + reference.latency.p50, + ) + + const sign = experiment.latency.p50 >= reference.latency.p50 ? '+' : '' + const msDifference = `${sign}${(experiment.latency.p50 - reference.latency.p50).toFixed(0)}ms` + const percentageChange = formatPercentageChange(experiment.latency.p50, reference.latency.p50) + + const benchmarkName = `${name} (${experiment.label})` + + comparisonTableCli.push([ + benchmarkName, + `${formatEfps(reference.latency.p50)} efps (${reference.latency.p50.toFixed(0)}ms)`, + `${formatEfps(experiment.latency.p50)} efps (${experiment.latency.p50.toFixed(0)}ms)`, + `${significantlyDifferent ? chalk.red(msDifference) : msDifference} (${percentageChange})`, + significantlyDifferent ? 
'🔴' : '✅', + ]) + + referenceTableCli.push([ + benchmarkName, + `${reference.latency.p50.toFixed(0)}ms`, + `${reference.latency.p75.toFixed(0)}ms`, + `${reference.latency.p90.toFixed(0)}ms`, + `${reference.latency.p99.toFixed(0)}ms`, + `${reference.blockingTime.toFixed(0)}ms`, + `${(reference.runDuration / 1000).toFixed(1)}s`, + ]) + + experimentTableCli.push([ + benchmarkName, + `${experiment.latency.p50.toFixed(0)}ms`, + `${experiment.latency.p75.toFixed(0)}ms`, + `${experiment.latency.p90.toFixed(0)}ms`, + `${experiment.latency.p99.toFixed(0)}ms`, + `${experiment.blockingTime.toFixed(0)}ms`, + `${(experiment.runDuration / 1000).toFixed(1)}s`, + ]) } } -console.log(table.toString()) -console.log(` +console.log() +console.log('Reference vs experiment') +console.log(comparisonTableCli.toString()) +console.log() +console.log('Reference result') +console.log(referenceTableCli.toString()) +console.log() +console.log('Experiment result') +console.log(experimentTableCli.toString()) + +let comparisonTable = ` +| Benchmark | reference
<br/>latency of \`sanity@${REFERENCE_TAG}\` | experiment<br/>latency of this branch | Δ (%)<br/>
latency difference | | +| :-- | :-- | :-- | :-- | --- | +` + +const detailedInformationHeader = ` +| Benchmark | latency | p75 | p90 | p99 | blocking time | test duration | +| --------- | ------: | --: | --: | --: | ------------: | ------------: | +` + +let referenceTable = detailedInformationHeader +let experimentTable = detailedInformationHeader + +for (const {name, results} of testResults) { + for (const {experiment, reference} of results) { + const significantlyDifferent = isSignificantlyDifferent( + experiment.latency.p50, + reference.latency.p50, + ) + + const sign = experiment.latency.p50 >= reference.latency.p50 ? '+' : '' + const msDifference = `${sign}${(experiment.latency.p50 - reference.latency.p50).toFixed(0)}ms` + const percentageChange = formatPercentageChange(experiment.latency.p50, reference.latency.p50) + + const benchmarkName = `${name} (${experiment.label})` + + comparisonTable += + // benchmark name + `| ${benchmarkName} ` + + // reference latency + `| ${formatEfpsPlain(reference.latency.p50)} efps (${reference.latency.p50.toFixed(0)}ms) ` + + // experiment latency + `| ${formatEfpsPlain(experiment.latency.p50)} efps (${experiment.latency.p50.toFixed(0)}ms) ` + + // difference + `| ${msDifference} (${percentageChange}) ` + + // status + `| ${significantlyDifferent ? '🔴' : '✅'} ` + + `|\n` + + referenceTable += + // benchmark name + `| ${benchmarkName} ` + + // latency + `| ${reference.latency.p50.toFixed(0)}ms ` + + // p75 + `| ${reference.latency.p75.toFixed(0)}ms ` + + // p90 + `| ${reference.latency.p90.toFixed(0)}ms ` + + // p99 + `| ${reference.latency.p99.toFixed(0)}ms ` + + // blocking time + `| ${reference.blockingTime.toFixed(0)}ms ` + + // test duration + `| ${(reference.runDuration / 1000).toFixed(1)}s ` + + `|\n` + + experimentTable += + // benchmark name + `| ${benchmarkName} ` + + // latency + `| ${experiment.latency.p50.toFixed(0)}ms ` + + // p75 + `| ${experiment.latency.p75.toFixed(0)}ms ` + + // p90 + `| ${experiment.latency.p90.toFixed(0)}ms ` + + // p99 + `| ${experiment.latency.p99.toFixed(0)}ms ` + + // blocking time + `| ${experiment.blockingTime.toFixed(0)}ms ` + + // test duration + `| ${(experiment.runDuration / 1000).toFixed(1)}s ` + + `|\n` + } +} + +const markdown = `### ⚡️ Editor Performance Report + +Updated ${new Date().toUTCString()} + +${comparisonTable} + +> **efps** — editor "frames per second". The number of updates assumed to be possible within a second. +> +> Derived from input latency. \`efps = 1000 / input_latency\` + +
+ +Detailed information + +### 🏠 Reference result + +The performance result of \`sanity@${REFERENCE_TAG}\` + + +${referenceTable} + +### 🧪 Experiment result + +The performance result of this branch + +${experimentTable} + +### 📚 Glossary + +> #### column definitions +> +> - **benchmark** — the name of the test, e.g. "article", followed by the label of the field being measured, e.g. "(title)". +> - **latency** — the time between when a key was pressed and when it was rendered. derived from a set of samples. the median (p50) is shown to show the most common latency. +> - **p75** — the 75th percentile of the input latency in the test run. 75% of the sampled inputs in this benchmark were processed faster than this value. this provides insight into the upper range of typical performance. +> - **p90** — the 90th percentile of the input latency in the test run. 90% of the sampled inputs were faster than this. this metric helps identify slower interactions that occurred less frequently during the benchmark. +> - **p99** — the 99th percentile of the input latency in the test run. only 1% of sampled inputs were slower than this. this represents the worst-case scenarios encountered during the benchmark, useful for identifying potential performance outliers. +> - **blocking time** — the total time during which the main thread was blocked, preventing user input and UI updates. this metric helps identify performance bottlenecks that may cause the interface to feel unresponsive. +> - **test duration** — how long the test run took to complete. + +
+` -│ ${chalk.bold('eFPS — editor "Frames Per Second"')} -│ -│ The number of renders ("frames") that is assumed to be possible -│ within a second. Derived from input latency. ${chalk.green('Higher')} is better. -`) +// Write markdown file to root of results +const markdownOutputPath = path.join(workspaceDir, 'results', 'benchmark-results.md') +await fs.promises.writeFile(markdownOutputPath, markdown) diff --git a/perf/efps/runTest.ts b/perf/efps/runTest.ts index 201487686df..1f93a34f9fc 100644 --- a/perf/efps/runTest.ts +++ b/perf/efps/runTest.ts @@ -5,7 +5,6 @@ import {fileURLToPath} from 'node:url' import {type SanityClient} from '@sanity/client' import react from '@vitejs/plugin-react' -import {type Ora} from 'ora' import {chromium} from 'playwright' import sourcemaps from 'rollup-plugin-sourcemaps' import handler from 'serve-handler' @@ -17,43 +16,44 @@ import {type EfpsResult, type EfpsTest, type EfpsTestRunnerContext} from './type const workspaceDir = path.dirname(fileURLToPath(import.meta.url)) interface RunTestOptions { - prefix: string test: EfpsTest resultsDir: string - spinner: Ora projectId: string headless: boolean client: SanityClient + sanityPkgPath: string + key: string + enableProfiler: boolean + log: (text: string) => void } export async function runTest({ - prefix, test, resultsDir, - spinner, projectId, headless, client, + sanityPkgPath, + key, + enableProfiler, + log, }: RunTestOptions): Promise { - const log = (text: string) => { - spinner.text = `${prefix}\n └ ${text}` - } - - spinner.start(prefix) - - const outDir = path.join(workspaceDir, 'builds', test.name) - const testResultsDir = path.join(resultsDir, test.name) + const outDir = path.join(workspaceDir, 'builds', test.name, key) + const testResultsDir = path.join(resultsDir, test.name, key) await fs.promises.mkdir(outDir, {recursive: true}) log('Building…') + const alias: Record = { + '#config': fileURLToPath(test.configPath!), + 'sanity': sanityPkgPath, + } + await vite.build({ appType: 'spa', build: {outDir, sourcemap: true}, plugins: [{...sourcemaps(), enforce: 'pre'}, react()], - resolve: { - alias: {'#config': fileURLToPath(test.configPath!)}, - }, + resolve: {alias}, logLevel: 'silent', }) @@ -103,17 +103,19 @@ export async function runTest({ typeof test.document === 'function' ? await test.document(runnerContext) : test.document document = await client.create(documentToCreate) - const cdp = await context.newCDPSession(page) + const cdp = enableProfiler ? await context.newCDPSession(page) : null log('Loading editor…') await page.goto( - `http://localhost:3300/intent/edit/id=${encodeURIComponent(document._id)};type=${encodeURIComponent( - documentToCreate._type, - )}`, + `http://localhost:3300/intent/edit/id=${encodeURIComponent( + document._id, + )};type=${encodeURIComponent(documentToCreate._type)}`, ) - await cdp.send('Profiler.enable') - await cdp.send('Profiler.start') + if (cdp) { + await cdp.send('Profiler.enable') + await cdp.send('Profiler.start') + } log('Benchmarking…') const result = await test.run({...runnerContext, document}) @@ -121,24 +123,24 @@ export async function runTest({ log('Saving results…') const results = Array.isArray(result) ? 
result : [result] - const {profile} = await cdp.send('Profiler.stop') - const remappedProfile = await remapCpuProfile(profile, outDir) - await fs.promises.mkdir(testResultsDir, {recursive: true}) await fs.promises.writeFile( path.join(testResultsDir, 'results.json'), JSON.stringify(results, null, 2), ) - await fs.promises.writeFile( - path.join(testResultsDir, 'raw.cpuprofile'), - JSON.stringify(profile), - ) - await fs.promises.writeFile( - path.join(testResultsDir, 'mapped.cpuprofile'), - JSON.stringify(remappedProfile), - ) - spinner.succeed(`Ran benchmark '${test.name}'`) + if (cdp) { + const {profile} = await cdp.send('Profiler.stop') + await fs.promises.writeFile( + path.join(testResultsDir, 'raw.cpuprofile'), + JSON.stringify(profile), + ) + const remappedProfile = await remapCpuProfile(profile, outDir) + await fs.promises.writeFile( + path.join(testResultsDir, 'mapped.cpuprofile'), + JSON.stringify(remappedProfile), + ) + } return results } finally { diff --git a/perf/efps/tests/article/article.ts b/perf/efps/tests/article/article.ts index f99cdde3b3f..25d88082d88 100644 --- a/perf/efps/tests/article/article.ts +++ b/perf/efps/tests/article/article.ts @@ -140,30 +140,10 @@ export default defineEfpsTest({ return document }, - run: async ({page}) => { - return [ - { - label: 'title', - ...(await measureFpsForInput( - page.locator('[data-testid="field-title"] input[type="text"]'), - )), - }, - { - label: 'body', - ...(await measureFpsForPte(page.locator('[data-testid="field-body"]'))), - }, - { - label: 'string in object', - ...(await measureFpsForInput( - page.locator('[data-testid="field-seo.metaTitle"] input[type="text"]'), - )), - }, - { - label: 'string in array', - ...(await measureFpsForInput( - page.locator('[data-testid="field-tags"] input[type="text"]').first(), - )), - }, - ] - }, + run: async ({page}) => [ + await measureFpsForInput({page, fieldName: 'title'}), + await measureFpsForPte({page, fieldName: 'body'}), + await measureFpsForInput({page, fieldName: 'seo.metaTitle', label: 'string inside object'}), + await measureFpsForInput({page, fieldName: 'tags', label: 'string inside array'}), + ], }) diff --git a/perf/efps/tests/recipe/recipe.ts b/perf/efps/tests/recipe/recipe.ts index 3c9f8e4c230..9b6fc523076 100644 --- a/perf/efps/tests/recipe/recipe.ts +++ b/perf/efps/tests/recipe/recipe.ts @@ -160,22 +160,9 @@ export default defineEfpsTest({ return recipe }, - run: async ({page}) => { - return [ - { - label: 'name', - ...(await measureFpsForInput( - page.locator('[data-testid="field-name"] input[type="text"]'), - )), - }, - { - label: 'description', - ...(await measureFpsForInput(page.locator('[data-testid="field-description"] textarea'))), - }, - { - label: 'instructions', - ...(await measureFpsForPte(page.locator('[data-testid="field-instructions"]'))), - }, - ] - }, + run: async ({page}) => [ + await measureFpsForInput({page, fieldName: 'name'}), + await measureFpsForInput({page, fieldName: 'description'}), + await measureFpsForPte({page, fieldName: 'instructions'}), + ], }) diff --git a/perf/efps/tests/synthetic/synthetic.ts b/perf/efps/tests/synthetic/synthetic.ts index 2ed731428b8..e0cdc573180 100644 --- a/perf/efps/tests/synthetic/synthetic.ts +++ b/perf/efps/tests/synthetic/synthetic.ts @@ -120,20 +120,12 @@ export default defineEfpsTest({ return synthetic }, - run: async ({page}) => { - return [ - { - label: 'title', - ...(await measureFpsForInput( - page.locator('[data-testid="field-title"] input[type="text"]'), - )), - }, - { - label: 'string in object', - 
...(await measureFpsForInput(
-          page.locator('[data-testid="field-syntheticObject.name"] input[type="text"]'),
-        )),
-      },
-    ]
-  },
+  run: async ({page}) => [
+    await measureFpsForInput({page, fieldName: 'title'}),
+    await measureFpsForInput({
+      page,
+      fieldName: 'syntheticObject.name',
+      label: 'string inside object',
+    }),
+  ],
 })
diff --git a/perf/efps/types.ts b/perf/efps/types.ts
index 910932926e3..835c64c8bde 100644
--- a/perf/efps/types.ts
+++ b/perf/efps/types.ts
@@ -12,18 +12,25 @@ export interface EfpsTest {
   name: string
   configPath: string | undefined
   document: SanityDocumentStub | ((context: EfpsTestRunnerContext) => Promise<SanityDocumentStub>)
-  run: (context: EfpsTestRunnerContext & {document: SanityDocument}) => Promise<EfpsTestResult>
+  run: (context: EfpsTestRunnerContext & {document: SanityDocument}) => Promise<EfpsResult | EfpsResult[]>
 }
 
 export interface EfpsResult {
-  label?: string
-  p50: number
-  p75: number
-  p90: number
-  latencies: number[]
+  label: string
+  runDuration: number
+  blockingTime: number
+  latency: {
+    p50: number
+    p75: number
+    p90: number
+    p99: number
+  }
 }
 
-export type EfpsTestResult = EfpsResult | EfpsResult[]
+export interface EfpsAbResult {
+  experiment: EfpsResult
+  reference: EfpsResult
+}
 
 export function defineEfpsTest(config: EfpsTest): EfpsTest {
   return config
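
---

Note (not part of the patch): the report generated by perf/efps/index.ts derives its eFPS figures from the measured median (p50) input latency and flags regressions against a fixed threshold. The sketch below mirrors the patch's formatEfps and isSignificantlyDifferent logic; the isRegression name and the 21ms/27ms example medians are illustrative assumptions, not values from the patch.

// efps ("editor frames per second") is derived from input latency: efps = 1000 / latency.
const efps = (latencyMs: number): number => 1000 / latencyMs

// A row is flagged when the experiment's median latency is at least
// WARNING_THRESHOLD (20%) slower than the reference, unless both medians are
// already under 16ms (within a 60fps frame budget), in which case the delta
// is treated as noise.
const WARNING_THRESHOLD = 0.2

function isRegression(experimentP50: number, referenceP50: number): boolean {
  if (experimentP50 < 16 && referenceP50 < 16) return false
  const delta = (experimentP50 - referenceP50) / referenceP50
  return delta >= WARNING_THRESHOLD
}

// Hypothetical medians: reference at 21ms, experiment at 27ms.
console.log(efps(21).toFixed(1)) // "47.6" efps
console.log(efps(27).toFixed(1)) // "37.0" efps
console.log(isRegression(27, 21)) // true -> flagged as a regression (+28.6% latency)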