From bacf52544c1529b5f640837a740038c8011ccd6b Mon Sep 17 00:00:00 2001 From: Claas Augner <495429+caugner@users.noreply.github.com> Date: Thu, 8 Feb 2024 16:51:48 +0100 Subject: [PATCH] chore(ai-help): remove old index script (#10484) It is no longer used and caused a stage build to fail. --- .github/workflows/prod-build.yml | 8 - .github/workflows/stage-build.yml | 8 - libs/env/index.d.ts | 2 - libs/env/index.js | 3 - package.json | 2 - scripts/ai-help.ts | 341 ------------------------------ yarn.lock | 71 +------ 7 files changed, 1 insertion(+), 434 deletions(-) delete mode 100644 scripts/ai-help.ts diff --git a/.github/workflows/prod-build.yml b/.github/workflows/prod-build.yml index f21b85c18a56..67888f2121f9 100644 --- a/.github/workflows/prod-build.yml +++ b/.github/workflows/prod-build.yml @@ -372,14 +372,6 @@ jobs: wait $pid done - - name: Update AI Help index - run: yarn ai-help update-index - env: - CONTENT_ROOT: ${{ github.workspace }}/mdn/content/files - OPENAI_KEY: ${{ secrets.OPENAI_KEY }} - SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_SERVICE_ROLE_KEY }} - SUPABASE_URL: ${{ secrets.SUPABASE_URL }} - - name: Update AI Help index with macros run: yarn ai-help-macros update-index env: diff --git a/.github/workflows/stage-build.yml b/.github/workflows/stage-build.yml index c74e6035d6c0..febb1bcc9b0a 100644 --- a/.github/workflows/stage-build.yml +++ b/.github/workflows/stage-build.yml @@ -365,14 +365,6 @@ jobs: wait $pid done - - name: Update AI Help index - run: yarn ai-help update-index - env: - CONTENT_ROOT: ${{ github.workspace }}/mdn/content/files - OPENAI_KEY: ${{ secrets.OPENAI_KEY }} - SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_SERVICE_ROLE_KEY }} - SUPABASE_URL: ${{ secrets.SUPABASE_URL }} - - name: Update AI Help index with macros run: yarn ai-help-macros update-index env: diff --git a/libs/env/index.d.ts b/libs/env/index.d.ts index d8c5b3b57e4e..149eace111a8 100644 --- a/libs/env/index.d.ts +++ b/libs/env/index.d.ts @@ -31,6 +31,4 @@ export const FAKE_V1_API: boolean; export const SENTRY_DSN_BUILD: string; export const OPENAI_KEY: string; export const PG_URI: string; -export const SUPABASE_URL: string; -export const SUPABASE_SERVICE_ROLE_KEY: string; export const SAMPLE_SIGN_KEY: Buffer; diff --git a/libs/env/index.js b/libs/env/index.js index 9593cd7327a5..adc89c61281f 100644 --- a/libs/env/index.js +++ b/libs/env/index.js @@ -169,9 +169,6 @@ export const FAKE_V1_API = JSON.parse(process.env.SERVER_FAKE_V1_API || false); export const OPENAI_KEY = process.env.OPENAI_KEY || ""; export const PG_URI = process.env.PG_URI || ""; -export const SUPABASE_URL = process.env.SUPABASE_URL || ""; -export const SUPABASE_SERVICE_ROLE_KEY = - process.env.SUPABASE_SERVICE_ROLE_KEY || ""; export const SAMPLE_SIGN_KEY = process.env.BUILD_SAMPLE_SIGN_KEY ? Buffer.from(process.env.BUILD_SAMPLE_SIGN_KEY, "base64") diff --git a/package.json b/package.json index 7954f30a8969..8b3207e3de52 100644 --- a/package.json +++ b/package.json @@ -13,7 +13,6 @@ "yari-tool": "tool/cli.js" }, "scripts": { - "ai-help": "ts-node scripts/ai-help.ts", "ai-help-macros": "ts-node scripts/ai-help-macros.ts", "analyze": "source-map-explorer 'client/build/static/js/*.js'", "analyze:css": "source-map-explorer 'client/build/static/css/*.css'", @@ -157,7 +156,6 @@ "@mdn/minimalist": "^2.0.4", "@playwright/test": "^1.41.2", "@pmmmwh/react-refresh-webpack-plugin": "^0.5.11", - "@supabase/supabase-js": "^2.39.3", "@svgr/webpack": "^8.1.0", "@swc/core": "^1.4.0", "@testing-library/react": "^14.2.1", diff --git a/scripts/ai-help.ts b/scripts/ai-help.ts deleted file mode 100644 index f5c52543f5b6..000000000000 --- a/scripts/ai-help.ts +++ /dev/null @@ -1,341 +0,0 @@ -import { createHash } from "node:crypto"; -import { readFile } from "node:fs/promises"; - -import caporal from "@caporal/core"; -import { SupabaseClient, createClient } from "@supabase/supabase-js"; -import { fdir } from "fdir"; -import frontmatter from "front-matter"; -import OpenAI from "openai"; - -import { DocFrontmatter } from "../libs/types/document.js"; -import { - CONTENT_ROOT, - OPENAI_KEY, - SUPABASE_SERVICE_ROLE_KEY, - SUPABASE_URL, -} from "../libs/env/index.js"; - -const { program } = caporal; - -const MAX_TABLE_LENGTH = 10000; -const IGNORE_SECTIONS = ["Specifications", "Browser compatibility", "See also"]; - -interface IndexedDoc { - id: number; - url: string; - slug: string; - title: string; - token_count: number | null; - checksum: string; -} - -interface Doc { - url: string; - slug: string; - title: string; - content: string; - checksum: string; -} - -export async function updateEmbeddings(directory: string) { - if (!OPENAI_KEY || !SUPABASE_URL || !SUPABASE_SERVICE_ROLE_KEY) { - throw Error( - "Please set these environment variables: OPENAI_KEY, SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY" - ); - } - - // Supabase. - const supabaseClient = createClient(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY); - - // Open AI. - const openai = new OpenAI({ - apiKey: OPENAI_KEY, - }); - - const createEmbedding = async (content: string) => { - // OpenAI recommends replacing newlines with spaces for best results (specific to embeddings) - const input = content.replace(/\n/g, " "); - - let embeddingResponse: OpenAI.Embeddings.CreateEmbeddingResponse; - try { - embeddingResponse = await openai.embeddings.create({ - model: "text-embedding-ada-002", - input, - }); - } catch (e: any) { - const { - data: { - error: { message, type }, - }, - status, - statusText, - } = e.response; - console.error( - `[!] Failed to create embedding (${status} ${statusText}): ${type} - ${message}` - ); - // Try again with trimmed content. - embeddingResponse = await openai.embeddings.create({ - model: "text-embedding-ada-002", - input: input.substring(0, 15000), - }); - } - - const { - data: [{ embedding }], - usage: { total_tokens }, - } = embeddingResponse; - - return { - total_tokens, - embedding, - }; - }; - - console.log(`Retrieving all indexed documents...`); - const existingDocs = await fetchAllExistingDocs(supabaseClient); - console.log(`-> Done.`); - - const existingDocByUrl = new Map( - existingDocs.map((doc) => [doc.url, doc]) - ); - - console.log(`Determining changed and deleted documents...`); - - const seenUrls = new Set(); - const updates: Doc[] = []; - - for await (const { url, slug, title, content } of contentDocs(directory)) { - seenUrls.add(url); - const checksum = createHash("sha256").update(content).digest("base64"); - - // Check for existing document in DB and compare checksums. - const existingDoc = existingDocByUrl.get(url); - - if (existingDoc?.checksum !== checksum) { - updates.push({ - url, - slug, - title, - content, - checksum, - }); - continue; - } else if (existingDoc && existingDoc.token_count === null) { - // (Legacy migration:) Add content, token_count, embedding where missing. - console.log(`-> [${url}] Adding content/token_count/embedding...`); - const { total_tokens, embedding } = await createEmbedding(content); - - await supabaseClient - .from("mdn_doc") - .update({ - content, - token_count: total_tokens, - embedding, - }) - .filter("id", "eq", existingDoc.id) - .throwOnError(); - } - } - console.log( - `-> ${updates.length} of ${seenUrls.size} documents were changed (or added).` - ); - const deletions: IndexedDoc[] = [...existingDocByUrl.entries()] - .filter(([key]) => !seenUrls.has(key)) - .map(([, value]) => value); - console.log( - `-> ${deletions.length} of ${existingDocs.length} indexed documents were deleted (or moved).` - ); - - if (updates.length > 0) { - console.log(`Applying updates...`); - for (const { url, slug, title, content, checksum } of updates) { - try { - console.log(`-> [${url}] Updating document...`); - const existingDoc = existingDocByUrl.get(url); - - if (existingDoc) { - await supabaseClient - .from("mdn_doc_section") - .delete() - .filter("doc_id", "eq", existingDoc.id) - .throwOnError(); - } - - // Embedding for full document. - const { total_tokens, embedding } = await createEmbedding(content); - - // Create/update document record. Intentionally clear checksum until we - // have successfully generated all document sections. - const { data: doc } = await supabaseClient - .from("mdn_doc") - .upsert( - { - checksum: null, - url, - slug, - title, - content, - token_count: total_tokens, - embedding, - }, - { onConflict: "url" } - ) - .select() - .single() - .throwOnError(); - - const sections = splitAndFilterSections(content); - - console.log( - `-> [${url}] Indexing ${sections.length} document sections...` - ); - - await Promise.all( - sections.map(async ({ heading, content }) => { - const { total_tokens, embedding } = await createEmbedding(content); - - await supabaseClient - .from("mdn_doc_section") - .insert({ - doc_id: doc.id, - heading, - content, - token_count: total_tokens, - embedding: embedding, - }) - .select() - .single() - .throwOnError(); - }) - ); - - // Set document checksum so that we know this document was stored successfully - await supabaseClient - .from("mdn_doc") - .update({ checksum }) - .filter("id", "eq", doc.id) - .throwOnError(); - } catch (err: any) { - console.error( - `!> [${url}] Failed to update document. Document has been marked with null checksum to indicate that it needs to be re-generated.` - ); - const context = err?.response?.data ?? err?.response ?? err; - console.error(context); - } - } - console.log(`-> Done.`); - } - - if (deletions.length > 0) { - console.log(`Applying deletions...`); - for (const { id, url } of deletions) { - console.log(`-> [${url}] Deleting indexed document...`); - await supabaseClient.from("mdn_doc").delete().eq("id", id).throwOnError(); - } - console.log(`-> Done.`); - } -} - -async function* contentPaths(directory: string) { - const api = new fdir() - .withFullPaths() - .withErrors() - .filter((filePath) => filePath.endsWith("index.md")) - .crawl(directory); - - const paths = await api.withPromise(); - - for (const path of paths) { - yield path; - } -} - -async function* contentDocs(directory: string) { - for await (const contentPath of contentPaths(directory)) { - const raw = await readFile(contentPath, "utf-8"); - const { attributes, body } = frontmatter(raw); - - const { slug, title } = attributes; - - let content = body; - content = removeLongTables(content); - content = removeTags(content); - content = removeMacroCalls(content); - content = content.trim(); - - yield { - slug, - url: `/en-US/docs/${slug}`, - title, - content: `# ${title}\n\n${content}`, - }; - } -} - -function removeLongTables(str: string): string { - return str.replace(/]*>(?:.*?)<\/table>/gis, (table) => - table.length <= MAX_TABLE_LENGTH ? table : " " - ); -} - -function removeTags(str: string): string { - return str.replace(/<[^>]+>/g, " "); -} - -function removeMacroCalls(str: string): string { - return str.replace(/\{\{.+\}\}/g, " "); -} -function splitAndFilterSections( - str: string -): { heading: string; content: string }[] { - return ( - str - .split(/(?=^## )/gm) - .map((s) => s.trim()) - .filter(Boolean) - .map((section) => { - const [firstLine, ...lines] = section.split("\n"); - const heading = firstLine.replace(/^#+ /, ""); - const content = lines.join("\n").trim(); - return { heading, content }; - }) - .filter(({ heading }) => !IGNORE_SECTIONS.includes(heading)) - // Ignore sections with few words. - .filter(({ content }) => content.split(/\b\w+\b/g).length >= 10) - ); -} -async function fetchAllExistingDocs(supabase: SupabaseClient) { - const PAGE_SIZE = 1000; - const selectDocs = () => - supabase - .from("mdn_doc") - .select("id, url, slug, title, checksum, token_count") - .order("id") - .limit(PAGE_SIZE); - - let { data } = await selectDocs().throwOnError(); - let allData = data; - while (data.length === PAGE_SIZE) { - const lastItem = data[data.length - 1]; - ({ data } = await selectDocs().gt("id", lastItem.id).throwOnError()); - allData = [...allData, ...data]; - } - - return allData; -} - -// CLI. -program - .command( - "update-index", - "Generates OpenAI embeddings for all document sections and uploads them to Supabase." - ) - .argument("", "Path in which to execute git", { - default: CONTENT_ROOT, - }) - .action(function (params) { - const { directory } = params.args as { directory: string }; - return updateEmbeddings(directory); - }); - -program.run(); diff --git a/yarn.lock b/yarn.lock index a2629c525fb1..b099a7314c64 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2445,63 +2445,6 @@ resolved "https://registry.yarnpkg.com/@stripe/stripe-js/-/stripe-js-2.4.0.tgz#7a7e5b187b9e9bb43073edd946ec3e9a778e61bd" integrity sha512-WFkQx1mbs2b5+7looI9IV1BLa3bIApuN3ehp9FP58xGg7KL9hCHDECgW3BwO9l9L+xBPVAD7Yjn1EhGe6EDTeA== -"@supabase/functions-js@^2.1.5": - version "2.1.5" - resolved "https://registry.yarnpkg.com/@supabase/functions-js/-/functions-js-2.1.5.tgz#ed1b85f499dfda21d40fe39b86ab923117cb572b" - integrity sha512-BNzC5XhCzzCaggJ8s53DP+WeHHGT/NfTsx2wUSSGKR2/ikLFQTBCDzMvGz/PxYMqRko/LwncQtKXGOYp1PkPaw== - dependencies: - "@supabase/node-fetch" "^2.6.14" - -"@supabase/gotrue-js@^2.60.0": - version "2.60.1" - resolved "https://registry.yarnpkg.com/@supabase/gotrue-js/-/gotrue-js-2.60.1.tgz#40cd7e9229a22eb70359a909c15681ae10ae2bbb" - integrity sha512-dM28NhyPS5NLWpJbVokxGbuEMmMK2K+EBXYlNU2NEYzp1BrkdxetNh8ucslMbKauJ93XAEhbMCQHSO9fZ2E+DQ== - dependencies: - "@supabase/node-fetch" "^2.6.14" - -"@supabase/node-fetch@^2.6.14": - version "2.6.14" - resolved "https://registry.yarnpkg.com/@supabase/node-fetch/-/node-fetch-2.6.14.tgz#6a3e2924e3de8aeeb82c193c786ffb25da9af23f" - integrity sha512-w/Tsd22e/5fAeoxqQ4P2MX6EyF+iM6rc9kmlMVFkHuG0rAltt2TLhFbDJfemnHbtvnazWaRfy5KnFU/SYT37dQ== - dependencies: - whatwg-url "^5.0.0" - -"@supabase/postgrest-js@^1.9.0": - version "1.9.0" - resolved "https://registry.yarnpkg.com/@supabase/postgrest-js/-/postgrest-js-1.9.0.tgz#00dddbe8119f1ec2179057e563bb54f28e6e31e3" - integrity sha512-axP6cU69jDrLbfihJKQ6vU27tklD0gzb9idkMN363MtTXeJVt5DQNT3JnJ58JVNBdL74hgm26rAsFNvHk+tnSw== - dependencies: - "@supabase/node-fetch" "^2.6.14" - -"@supabase/realtime-js@^2.9.3": - version "2.9.3" - resolved "https://registry.yarnpkg.com/@supabase/realtime-js/-/realtime-js-2.9.3.tgz#f822401aed70883dca5d538179b11089d6d1b6ed" - integrity sha512-lAp50s2n3FhGJFq+wTSXLNIDPw5Y0Wxrgt44eM5nLSA3jZNUUP3Oq2Ccd1CbZdVntPCWLZvJaU//pAd2NE+QnQ== - dependencies: - "@supabase/node-fetch" "^2.6.14" - "@types/phoenix" "^1.5.4" - "@types/ws" "^8.5.10" - ws "^8.14.2" - -"@supabase/storage-js@^2.5.4": - version "2.5.4" - resolved "https://registry.yarnpkg.com/@supabase/storage-js/-/storage-js-2.5.4.tgz#15946fa03574e94cdeff2b7fa2cd5b85880239f5" - integrity sha512-yspHD19I9uQUgfTh0J94+/r/g6hnhdQmw6Y7OWqr/EbnL6uvicGV1i1UDkkmeUHqfF9Mbt2sLtuxRycYyKv2ew== - dependencies: - "@supabase/node-fetch" "^2.6.14" - -"@supabase/supabase-js@^2.39.3": - version "2.39.3" - resolved "https://registry.yarnpkg.com/@supabase/supabase-js/-/supabase-js-2.39.3.tgz#dfdc60337182a0a7b45cafb256281b159b75e25c" - integrity sha512-NoltJSaJNKDJNutO5sJPAAi5RIWrn1z2XH+ig1+cHDojT6BTN7TvZPNa3Kq3gFQWfO5H1N9El/bCTZJ3iFW2kQ== - dependencies: - "@supabase/functions-js" "^2.1.5" - "@supabase/gotrue-js" "^2.60.0" - "@supabase/node-fetch" "^2.6.14" - "@supabase/postgrest-js" "^1.9.0" - "@supabase/realtime-js" "^2.9.3" - "@supabase/storage-js" "^2.5.4" - "@surma/rollup-plugin-off-main-thread@^2.2.3": version "2.2.3" resolved "https://registry.yarnpkg.com/@surma/rollup-plugin-off-main-thread/-/rollup-plugin-off-main-thread-2.2.3.tgz#ee34985952ca21558ab0d952f00298ad2190c053" @@ -3060,11 +3003,6 @@ resolved "https://registry.yarnpkg.com/@types/parse-json/-/parse-json-4.0.2.tgz#5950e50960793055845e956c427fc2b0d70c5239" integrity sha512-dISoDXWWQwUquiKsyZ4Ng+HX2KsPL7LyHKHQwgGFEA3IaKac4Obd+h2a/a6waisAoepJlBcx9paWqjA8/HVjCw== -"@types/phoenix@^1.5.4": - version "1.5.6" - resolved "https://registry.yarnpkg.com/@types/phoenix/-/phoenix-1.5.6.tgz#fca4e7315c7f743bf6f04805588b7261b11818db" - integrity sha512-e7jZ6I9uyRGsg7MNwQcarmBvRlbGb9DibbocE9crVnxqsy6C23RMxLWbJ2CQ3vgCW7taoL1L+F02EcjA6ld7XA== - "@types/prettier@^2.0.0": version "2.7.1" resolved "https://registry.yarnpkg.com/@types/prettier/-/prettier-2.7.1.tgz#dfd20e2dc35f027cdd6c1908e80a5ddc7499670e" @@ -3199,13 +3137,6 @@ resolved "https://registry.yarnpkg.com/@types/wrap-ansi/-/wrap-ansi-3.0.0.tgz#18b97a972f94f60a679fd5c796d96421b9abb9fd" integrity sha512-ltIpx+kM7g/MLRZfkbL7EsCEjfzCcScLpkg37eXEtx5kmrAKBkTJwd1GIAjDSL8wTpM6Hzn5YO4pSb91BEwu1g== -"@types/ws@^8.5.10": - version "8.5.10" - resolved "https://registry.yarnpkg.com/@types/ws/-/ws-8.5.10.tgz#4acfb517970853fa6574a3a6886791d04a396787" - integrity sha512-vmQSUcfalpIq0R9q7uTo2lXs6eGIpt9wtnLdMv9LVpIjCA/+ufZRozlVoVelIYixx1ugCBKDhn89vnsEGOCx9A== - dependencies: - "@types/node" "*" - "@types/ws@^8.5.5": version "8.5.5" resolved "https://registry.yarnpkg.com/@types/ws/-/ws-8.5.5.tgz#af587964aa06682702ee6dcbc7be41a80e4b28eb" @@ -16031,7 +15962,7 @@ write-file-atomic@^5.0.1: imurmurhash "^0.1.4" signal-exit "^4.0.1" -ws@^8.11.0, ws@^8.13.0, ws@^8.14.2, ws@^8.16.0: +ws@^8.11.0, ws@^8.13.0, ws@^8.16.0: version "8.16.0" resolved "https://registry.yarnpkg.com/ws/-/ws-8.16.0.tgz#d1cd774f36fbc07165066a60e40323eab6446fd4" integrity sha512-HS0c//TP7Ina87TfiPUz1rQzMhHrl/SG2guqRcTOIUYD2q8uhUdNHZYJUaQ8aTGPzCh+c6oawMKW35nFl1dxyQ==