From e16e8184b0ed54a1aca1698c543b95db23002379 Mon Sep 17 00:00:00 2001 From: Eric Arellano <14852634+Eric-Arellano@users.noreply.github.com> Date: Mon, 26 Feb 2024 09:36:47 -0500 Subject: [PATCH] Determine internal vs external links at parse time (#884) Part of https://github.com/Qiskit/documentation/issues/876 to split the internal and external link checkers into distinct programs. To do this, it's useful to split out internal vs. external links at parse time, whereas before we did it inside `FileBatch`. The external link checker won't use `FileBatch`. This is only a refactor and doesn't change the program, other than now using `HEAD` for external link requests rather than `GET`. We also now use a `Set` for anchors so that checking if an anchor is included is faster. --------- Co-authored-by: Arnau Casau <47946624+arnaucasau@users.noreply.github.com> --- scripts/commands/checkLinks.ts | 6 ++- scripts/lib/links/ExternalLink.ts | 1 + scripts/lib/links/FileBatch.test.ts | 12 ++++- scripts/lib/links/FileBatch.ts | 69 ++++++++++++++------------ scripts/lib/links/InternalLink.test.ts | 34 +++++++------ scripts/lib/links/InternalLink.ts | 8 +-- scripts/lib/links/extractLinks.test.ts | 28 +++++------ scripts/lib/links/extractLinks.ts | 53 +++++++++++++------- 8 files changed, 125 insertions(+), 86 deletions(-) diff --git a/scripts/commands/checkLinks.ts b/scripts/commands/checkLinks.ts index 07005b1b5fc..0880761b3c7 100644 --- a/scripts/commands/checkLinks.ts +++ b/scripts/commands/checkLinks.ts @@ -83,8 +83,10 @@ async function main() { const fileBatches = await determineFileBatches(args); const otherFiles = [ - ...(await globby("public/**/*")).map((fp) => new File(fp, [])), - ...SYNTHETIC_FILES.map((fp) => new File(fp, [], true)), + ...(await globby("public/{images,videos}/**/*")).map( + (fp) => new File(fp, new Set()), + ), + ...SYNTHETIC_FILES.map((fp) => new File(fp, new Set(), true)), ]; let allGood = true; diff --git a/scripts/lib/links/ExternalLink.ts b/scripts/lib/links/ExternalLink.ts index ff3efa3736a..07e0e11537e 100644 --- a/scripts/lib/links/ExternalLink.ts +++ b/scripts/lib/links/ExternalLink.ts @@ -38,6 +38,7 @@ export class ExternalLink { try { const response = await fetch(this.value, { headers: { "User-Agent": "qiskit-documentation-broken-links-finder" }, + method: "HEAD", }); if (response.status >= 300) { error = `Could not find link '${this.value}'`; diff --git a/scripts/lib/links/FileBatch.test.ts b/scripts/lib/links/FileBatch.test.ts index 04d0fd7b3be..e3029062a26 100644 --- a/scripts/lib/links/FileBatch.test.ts +++ b/scripts/lib/links/FileBatch.test.ts @@ -16,7 +16,11 @@ import { addLinksToMap } from "./FileBatch"; test("addLinksToMap()", () => { const linksToMap = new Map(); - addLinksToMap("file1.md", ["https://ibm.com", "./relative"], linksToMap); + addLinksToMap( + "file1.md", + new Set(["https://ibm.com", "./relative"]), + linksToMap, + ); expect(linksToMap).toEqual( new Map([ ["https://ibm.com", ["file1.md"]], @@ -24,7 +28,11 @@ test("addLinksToMap()", () => { ]), ); - addLinksToMap("file2.md", ["./relative", "/images/my_image.png"], linksToMap); + addLinksToMap( + "file2.md", + new Set(["./relative", "/images/my_image.png"]), + linksToMap, + ); expect(linksToMap).toEqual( new Map([ ["https://ibm.com", ["file1.md"]], diff --git a/scripts/lib/links/FileBatch.ts b/scripts/lib/links/FileBatch.ts index 032798c148a..557d522d512 100644 --- a/scripts/lib/links/FileBatch.ts +++ b/scripts/lib/links/FileBatch.ts @@ -66,43 +66,36 @@ export class FileBatch { * 2. A list of InternalLink objects to validate. * 3. A list of ExternalLink objects to validate. */ - async load(): Promise<[File[], InternalLink[], ExternalLink[]]> { + async load( + loadExternalLinks: boolean, + ): Promise<[File[], InternalLink[], ExternalLink[]]> { const files: File[] = []; for (let filePath of this.toLoad) { const parsed = await parseFile(filePath); files.push(new File(filePath, parsed.anchors)); } - const linksToOriginFiles = new Map(); + const internalLinksToOriginFiles = new Map(); + const externalLinksToOriginFiles = new Map(); for (const filePath of this.toCheck) { const parsed = await parseFile(filePath); files.push(new File(filePath, parsed.anchors)); - if (!IGNORED_FILES.has(filePath)) { - addLinksToMap(filePath, parsed.rawLinks, linksToOriginFiles); - } - } - - const internalLinks: InternalLink[] = []; - const externalLinks: ExternalLink[] = []; - for (let [linkPath, originFiles] of linksToOriginFiles) { - if (ALWAYS_IGNORED_URLS.has(linkPath)) { - continue; - } - originFiles = originFiles.filter( - (originFile) => - FILES_TO_IGNORES[originFile] == null || - !FILES_TO_IGNORES[originFile].includes(linkPath), - ); - - if (originFiles.length > 0) { - if (linkPath.startsWith("http")) { - externalLinks.push(new ExternalLink(linkPath, originFiles)); - } else { - internalLinks.push(new InternalLink(linkPath, originFiles)); - } + addLinksToMap(filePath, parsed.internalLinks, internalLinksToOriginFiles); + if (loadExternalLinks) { + addLinksToMap( + filePath, + parsed.externalLinks, + externalLinksToOriginFiles, + ); } } + const internalLinks = Array.from(internalLinksToOriginFiles.entries()).map( + ([link, originFiles]) => new InternalLink(link, originFiles), + ); + const externalLinks = Array.from(externalLinksToOriginFiles.entries()).map( + ([link, originFiles]) => new ExternalLink(link, originFiles), + ); return [files, internalLinks, externalLinks]; } @@ -111,19 +104,21 @@ export class FileBatch { * * Logs the results to the console and returns `true` if there were no issues. */ - async check(externalLinks: boolean, otherFiles: File[]): Promise { + async check( + checkExternalLinks: boolean, + otherFiles: File[], + ): Promise { console.log(`\n\nChecking links for ${this.description}`); - const [docsFiles, internalLinkList, externalLinkList] = await this.load(); + const [docsFiles, internalLinkList, externalLinkList] = + await this.load(checkExternalLinks); const existingFiles = docsFiles.concat(otherFiles); const results = internalLinkList.map((link) => link.check(existingFiles)); - if (externalLinks) { - // For loop reduces the risk of rate-limiting. - for (let link of externalLinkList) { - results.push(await link.check()); - } + // For loop reduces the risk of rate-limiting. + for (let link of externalLinkList) { + results.push(await link.check()); } let allGood = true; @@ -139,10 +134,18 @@ export class FileBatch { export function addLinksToMap( filePath: string, - links: string[], + links: Set, linksToOriginFiles: Map, ): void { + if (IGNORED_FILES.has(filePath)) return; links.forEach((link) => { + if ( + ALWAYS_IGNORED_URLS.has(link) || + FILES_TO_IGNORES[filePath]?.includes(link) + ) { + return; + } + const entry = linksToOriginFiles.get(link); if (entry === undefined) { linksToOriginFiles.set(link, [filePath]); diff --git a/scripts/lib/links/InternalLink.test.ts b/scripts/lib/links/InternalLink.test.ts index aeaf5b66946..5281bfbe39d 100644 --- a/scripts/lib/links/InternalLink.test.ts +++ b/scripts/lib/links/InternalLink.test.ts @@ -38,7 +38,7 @@ describe("Test the constructor of InternalLink", () => { describe("Validate links", () => { test("existing with absolute path", () => { let testLink = new InternalLink("/testpath", ["/testorigin.mdx"]); - let testFile = new File("docs/testpath.mdx", []); + let testFile = new File("docs/testpath.mdx", new Set()); const results = testLink.check([testFile]); expect(results).toBeUndefined(); }); @@ -47,7 +47,7 @@ describe("Validate links", () => { let testLink = new InternalLink("/test-alternative-path", [ "/testorigin.mdx", ]); - let testFile = new File("docs/testpath.mdx", []); + let testFile = new File("docs/testpath.mdx", new Set()); const results = testLink.check([testFile]); expect(results).toEqual( "❌ Could not find link '/test-alternative-path'. Appears in:\n /testorigin.mdx", @@ -58,7 +58,7 @@ describe("Validate links", () => { let testLink = new InternalLink("../testpath", [ "docs/test/testorigin.mdx", ]); - let testFile = new File("docs/testpath.mdx", []); + let testFile = new File("docs/testpath.mdx", new Set()); const results = testLink.check([testFile]); expect(results).toBeUndefined(); }); @@ -67,7 +67,7 @@ describe("Validate links", () => { let testLink = new InternalLink("../testpath", [ "docs/test1/test2/testorigin.mdx", ]); - let testFile = new File("docs/testpath.mdx", []); + let testFile = new File("docs/testpath.mdx", new Set()); const results = testLink.check([testFile]); expect(results).toEqual( "❌ Could not find link '../testpath'. Appears in:\n docs/test1/test2/testorigin.mdx", @@ -81,8 +81,8 @@ describe("Validate links", () => { "docs/test/test3/testorigin.mdx", "docs/test/test2/test4/testorigin.mdx", ]); - let testFile1 = new File("docs/testpath.mdx", []); - let testFile2 = new File("docs/test/test2/testpath.mdx", []); + let testFile1 = new File("docs/testpath.mdx", new Set()); + let testFile2 = new File("docs/test/test2/testpath.mdx", new Set()); const results = testLink.check([testFile1, testFile2]); expect(results).toBeUndefined(); }); @@ -94,8 +94,8 @@ describe("Validate links", () => { "docs/test/test3/testorigin.mdx", "docs/test/test2/test4/testorigin.mdx", ]); - let testFile1 = new File("docs/test/testpath.mdx", []); - let testFile2 = new File("docs/test2/test3/testpath.mdx", []); + let testFile1 = new File("docs/test/testpath.mdx", new Set()); + let testFile2 = new File("docs/test2/test3/testpath.mdx", new Set()); const results = testLink.check([testFile1, testFile2]); expect(results).toEqual( "❌ Could not find link '/testpath'. Appears in:\n" + @@ -115,8 +115,8 @@ describe("Validate links", () => { "docs/test/test3/testorigin.mdx", "docs/test/test2/test4/testorigin.mdx", ]); - let testFile1 = new File("docs/testpath.mdx", []); - let testFile2 = new File("docs/test/test2/testpath.mdx", []); + let testFile1 = new File("docs/testpath.mdx", new Set()); + let testFile2 = new File("docs/test/test2/testpath.mdx", new Set()); const results = testLink.check([testFile1, testFile2]); expect(results).toEqual( "❌ Could not find link '../testpath'. Appears in:\n docs/test/test2/testorigin.mdx\n docs/test/test3/testorigin.mdx", @@ -127,7 +127,7 @@ describe("Validate links", () => { let testLink = new InternalLink("/testpath#test_anchor", [ "/testorigin.mdx", ]); - let testFile = new File("docs/testpath.mdx", ["#test_anchor"]); + let testFile = new File("docs/testpath.mdx", new Set(["#test_anchor"])); const results = testLink.check([testFile]); expect(results).toBeUndefined(); }); @@ -136,7 +136,10 @@ describe("Validate links", () => { let testLink = new InternalLink("/testpath#test_anchor", [ "/testorigin.mdx", ]); - let testFile = new File("docs/testpath.mdx", ["#test_diff_anchor"]); + let testFile = new File( + "docs/testpath.mdx", + new Set(["#test_diff_anchor"]), + ); const results = testLink.check([testFile]); expect(results).toEqual( "❌ Could not find link '/testpath#test_anchor'. Appears in:\n /testorigin.mdx ❓ Did you mean '/testpath#test_diff_anchor'?", @@ -147,7 +150,7 @@ describe("Validate links", () => { let testLink = new InternalLink("../testpath#test_anchor", [ "docs/test/testorigin.mdx", ]); - let testFile = new File("docs/testpath.mdx", ["#test_anchor"]); + let testFile = new File("docs/testpath.mdx", new Set(["#test_anchor"])); const results = testLink.check([testFile]); expect(results).toBeUndefined(); }); @@ -156,7 +159,10 @@ describe("Validate links", () => { let testLink = new InternalLink("../testpath#test-anchor", [ "docs/test/testorigin.mdx", ]); - let testFile = new File("docs/testpath.mdx", ["#test_diff_anchor"]); + let testFile = new File( + "docs/testpath.mdx", + new Set(["#test_diff_anchor"]), + ); const results = testLink.check([testFile]); expect(results).toEqual( "❌ Could not find link '../testpath#test-anchor'. Appears in:\n docs/test/testorigin.mdx ❓ Did you mean '/testpath#test_diff_anchor'?", diff --git a/scripts/lib/links/InternalLink.ts b/scripts/lib/links/InternalLink.ts index f40f5038d0f..7ec7963f879 100644 --- a/scripts/lib/links/InternalLink.ts +++ b/scripts/lib/links/InternalLink.ts @@ -18,14 +18,14 @@ const CONTENT_FILE_EXTENSIONS = [".md", ".mdx", ".ipynb"]; export class File { readonly path: string; - readonly anchors: string[]; + readonly anchors: Set; readonly synthetic: boolean; /** * path: Path to the file * anchors: Anchors available in the file */ - constructor(path: string, anchors: string[], synthetic: boolean = false) { + constructor(path: string, anchors: Set, synthetic: boolean = false) { this.path = path; this.anchors = anchors; this.synthetic = synthetic; @@ -96,7 +96,7 @@ export class InternalLink { existingFile.path == filePath && (this.anchor == "" || existingFile.synthetic == true || - existingFile.anchors.includes(this.anchor)), + existingFile.anchors.has(this.anchor)), ), ); } @@ -122,7 +122,7 @@ export class InternalLink { if (score < minScoreLink) { minScoreLink = score; suggestionPath = file.path; - suggestionPathAnchors = file.anchors; + suggestionPathAnchors = Array.from(file.anchors); } }); diff --git a/scripts/lib/links/extractLinks.test.ts b/scripts/lib/links/extractLinks.test.ts index 752305df4fd..8c081ab5c6f 100644 --- a/scripts/lib/links/extractLinks.test.ts +++ b/scripts/lib/links/extractLinks.test.ts @@ -63,13 +63,15 @@ test("parseAnchors()", () => { ## \`code-header\` `); - expect(result).toEqual([ - "#my-top-level-heading", - "#header-2", - "#code-header", - "#this-is-a-hardcoded-anchor", - "#another_span", - ]); + expect(result).toEqual( + new Set([ + "#my-top-level-heading", + "#header-2", + "#code-header", + "#this-is-a-hardcoded-anchor", + "#another_span", + ]), + ); }); test("parseLinks()", async () => { @@ -81,11 +83,9 @@ test("parseLinks()", async () => { Explicit anchor `; - const result = await parseLinks(markdown); - expect(result).toEqual([ - "https://ibm.com", - "./relative", - "/images/my_image.png", - "./explicit-anchor", - ]); + const [internalLinks, externalLinks] = await parseLinks(markdown); + expect(internalLinks).toEqual( + new Set(["./relative", "/images/my_image.png", "./explicit-anchor"]), + ); + expect(externalLinks).toEqual(new Set(["https://ibm.com"])); }); diff --git a/scripts/lib/links/extractLinks.ts b/scripts/lib/links/extractLinks.ts index 39ab0b2a901..b9984fb4bc5 100644 --- a/scripts/lib/links/extractLinks.ts +++ b/scripts/lib/links/extractLinks.ts @@ -27,9 +27,11 @@ import { getRoot } from "../fs"; export type ParsedFile = { /** Anchors that the file defines. These can be linked to from other files. */ - anchors: string[]; - /** Links that this file has to other places. These need to be validated. */ - rawLinks: string[]; + anchors: Set; + /** Internal links that this file has to other places. */ + internalLinks: Set; + /** External links that this file has to other places. */ + externalLinks: Set; }; interface JupyterCell { @@ -45,16 +47,28 @@ export function markdownFromNotebook(rawContent: string): string { .join("\n"); } -export function parseAnchors(markdown: string): string[] { +export function parseAnchors(markdown: string): Set { // Anchors generated from markdown titles. const mdAnchors = markdownLinkExtractor(markdown).anchors; // Anchors from HTML id tags. const idAnchors = markdown.match(/(?<=id=")(.*)(?=")/gm) || []; - return [...mdAnchors, ...idAnchors.map((id) => `#${id}`)]; + return new Set([...mdAnchors, ...idAnchors.map((id) => `#${id}`)]); } -export async function parseLinks(markdown: string): Promise { - const result: string[] = []; +export async function parseLinks( + markdown: string, +): Promise<[Set, Set]> { + const internalLinks = new Set(); + const externalLinks = new Set(); + + const addLink = (link: string): void => { + if (link.startsWith("http")) { + externalLinks.add(link); + } else { + internalLinks.add(link); + } + }; + await unified() .use(rehypeParse) .use(remarkGfm) @@ -62,19 +76,19 @@ export async function parseLinks(markdown: string): Promise { .use(() => (tree: Root) => { visit(tree, "text", (TreeNode) => { markdownLinkExtractor(String(TreeNode.value)).links.forEach((url) => - result.push(url), + addLink(url), ); }); - visit(tree, "link", (TreeNode) => result.push(TreeNode.url)); - visit(tree, "image", (TreeNode) => result.push(TreeNode.url)); + visit(tree, "link", (TreeNode) => addLink(TreeNode.url)); + visit(tree, "image", (TreeNode) => addLink(TreeNode.url)); }) .use(remarkStringify) .process(markdown); - return result; + return [internalLinks, externalLinks]; } -async function parseObjectsInv(filePath: string): Promise { +async function parseObjectsInv(filePath: string): Promise> { const absoluteFilePath = path.join( getRoot(), removeSuffix(filePath, "objects.inv"), @@ -82,17 +96,22 @@ async function parseObjectsInv(filePath: string): Promise { const objinv = await ObjectsInv.fromFile(absoluteFilePath); // All URIs are relative to the objects.inv file const dirname = removePrefix(path.dirname(filePath), "public"); - const links = objinv.entries.map((entry) => path.join(dirname, entry.uri)); - return { rawLinks: links, anchors: [] }; + return new Set(objinv.entries.map((entry) => path.join(dirname, entry.uri))); } export async function parseFile(filePath: string): Promise { if (filePath.endsWith(".inv")) { - return await parseObjectsInv(filePath); + const links = await parseObjectsInv(filePath); + return { + internalLinks: links, + externalLinks: new Set(), + anchors: new Set(), + }; } + const source = await readFile(filePath, { encoding: "utf8" }); const markdown = path.extname(filePath) === ".ipynb" ? markdownFromNotebook(source) : source; - const links = await parseLinks(markdown); - return { anchors: parseAnchors(markdown), rawLinks: links }; + const [internalLinks, externalLinks] = await parseLinks(markdown); + return { anchors: parseAnchors(markdown), internalLinks, externalLinks }; }