diff --git a/scripts/commands/checkLinks.ts b/scripts/commands/checkLinks.ts index 07005b1b5fc..0880761b3c7 100644 --- a/scripts/commands/checkLinks.ts +++ b/scripts/commands/checkLinks.ts @@ -83,8 +83,10 @@ async function main() { const fileBatches = await determineFileBatches(args); const otherFiles = [ - ...(await globby("public/**/*")).map((fp) => new File(fp, [])), - ...SYNTHETIC_FILES.map((fp) => new File(fp, [], true)), + ...(await globby("public/{images,videos}/**/*")).map( + (fp) => new File(fp, new Set()), + ), + ...SYNTHETIC_FILES.map((fp) => new File(fp, new Set(), true)), ]; let allGood = true; diff --git a/scripts/lib/links/ExternalLink.ts b/scripts/lib/links/ExternalLink.ts index ff3efa3736a..07e0e11537e 100644 --- a/scripts/lib/links/ExternalLink.ts +++ b/scripts/lib/links/ExternalLink.ts @@ -38,6 +38,7 @@ export class ExternalLink { try { const response = await fetch(this.value, { headers: { "User-Agent": "qiskit-documentation-broken-links-finder" }, + method: "HEAD", }); if (response.status >= 300) { error = `Could not find link '${this.value}'`; diff --git a/scripts/lib/links/FileBatch.test.ts b/scripts/lib/links/FileBatch.test.ts index 04d0fd7b3be..e3029062a26 100644 --- a/scripts/lib/links/FileBatch.test.ts +++ b/scripts/lib/links/FileBatch.test.ts @@ -16,7 +16,11 @@ import { addLinksToMap } from "./FileBatch"; test("addLinksToMap()", () => { const linksToMap = new Map(); - addLinksToMap("file1.md", ["https://ibm.com", "./relative"], linksToMap); + addLinksToMap( + "file1.md", + new Set(["https://ibm.com", "./relative"]), + linksToMap, + ); expect(linksToMap).toEqual( new Map([ ["https://ibm.com", ["file1.md"]], @@ -24,7 +28,11 @@ test("addLinksToMap()", () => { ]), ); - addLinksToMap("file2.md", ["./relative", "/images/my_image.png"], linksToMap); + addLinksToMap( + "file2.md", + new Set(["./relative", "/images/my_image.png"]), + linksToMap, + ); expect(linksToMap).toEqual( new Map([ ["https://ibm.com", ["file1.md"]], diff --git a/scripts/lib/links/FileBatch.ts b/scripts/lib/links/FileBatch.ts index 032798c148a..557d522d512 100644 --- a/scripts/lib/links/FileBatch.ts +++ b/scripts/lib/links/FileBatch.ts @@ -66,43 +66,36 @@ export class FileBatch { * 2. A list of InternalLink objects to validate. * 3. A list of ExternalLink objects to validate. */ - async load(): Promise<[File[], InternalLink[], ExternalLink[]]> { + async load( + loadExternalLinks: boolean, + ): Promise<[File[], InternalLink[], ExternalLink[]]> { const files: File[] = []; for (let filePath of this.toLoad) { const parsed = await parseFile(filePath); files.push(new File(filePath, parsed.anchors)); } - const linksToOriginFiles = new Map(); + const internalLinksToOriginFiles = new Map(); + const externalLinksToOriginFiles = new Map(); for (const filePath of this.toCheck) { const parsed = await parseFile(filePath); files.push(new File(filePath, parsed.anchors)); - if (!IGNORED_FILES.has(filePath)) { - addLinksToMap(filePath, parsed.rawLinks, linksToOriginFiles); - } - } - - const internalLinks: InternalLink[] = []; - const externalLinks: ExternalLink[] = []; - for (let [linkPath, originFiles] of linksToOriginFiles) { - if (ALWAYS_IGNORED_URLS.has(linkPath)) { - continue; - } - originFiles = originFiles.filter( - (originFile) => - FILES_TO_IGNORES[originFile] == null || - !FILES_TO_IGNORES[originFile].includes(linkPath), - ); - - if (originFiles.length > 0) { - if (linkPath.startsWith("http")) { - externalLinks.push(new ExternalLink(linkPath, originFiles)); - } else { - internalLinks.push(new InternalLink(linkPath, originFiles)); - } + addLinksToMap(filePath, parsed.internalLinks, internalLinksToOriginFiles); + if (loadExternalLinks) { + addLinksToMap( + filePath, + parsed.externalLinks, + externalLinksToOriginFiles, + ); } } + const internalLinks = Array.from(internalLinksToOriginFiles.entries()).map( + ([link, originFiles]) => new InternalLink(link, originFiles), + ); + const externalLinks = Array.from(externalLinksToOriginFiles.entries()).map( + ([link, originFiles]) => new ExternalLink(link, originFiles), + ); return [files, internalLinks, externalLinks]; } @@ -111,19 +104,21 @@ export class FileBatch { * * Logs the results to the console and returns `true` if there were no issues. */ - async check(externalLinks: boolean, otherFiles: File[]): Promise { + async check( + checkExternalLinks: boolean, + otherFiles: File[], + ): Promise { console.log(`\n\nChecking links for ${this.description}`); - const [docsFiles, internalLinkList, externalLinkList] = await this.load(); + const [docsFiles, internalLinkList, externalLinkList] = + await this.load(checkExternalLinks); const existingFiles = docsFiles.concat(otherFiles); const results = internalLinkList.map((link) => link.check(existingFiles)); - if (externalLinks) { - // For loop reduces the risk of rate-limiting. - for (let link of externalLinkList) { - results.push(await link.check()); - } + // For loop reduces the risk of rate-limiting. + for (let link of externalLinkList) { + results.push(await link.check()); } let allGood = true; @@ -139,10 +134,18 @@ export class FileBatch { export function addLinksToMap( filePath: string, - links: string[], + links: Set, linksToOriginFiles: Map, ): void { + if (IGNORED_FILES.has(filePath)) return; links.forEach((link) => { + if ( + ALWAYS_IGNORED_URLS.has(link) || + FILES_TO_IGNORES[filePath]?.includes(link) + ) { + return; + } + const entry = linksToOriginFiles.get(link); if (entry === undefined) { linksToOriginFiles.set(link, [filePath]); diff --git a/scripts/lib/links/InternalLink.test.ts b/scripts/lib/links/InternalLink.test.ts index aeaf5b66946..5281bfbe39d 100644 --- a/scripts/lib/links/InternalLink.test.ts +++ b/scripts/lib/links/InternalLink.test.ts @@ -38,7 +38,7 @@ describe("Test the constructor of InternalLink", () => { describe("Validate links", () => { test("existing with absolute path", () => { let testLink = new InternalLink("/testpath", ["/testorigin.mdx"]); - let testFile = new File("docs/testpath.mdx", []); + let testFile = new File("docs/testpath.mdx", new Set()); const results = testLink.check([testFile]); expect(results).toBeUndefined(); }); @@ -47,7 +47,7 @@ describe("Validate links", () => { let testLink = new InternalLink("/test-alternative-path", [ "/testorigin.mdx", ]); - let testFile = new File("docs/testpath.mdx", []); + let testFile = new File("docs/testpath.mdx", new Set()); const results = testLink.check([testFile]); expect(results).toEqual( "❌ Could not find link '/test-alternative-path'. Appears in:\n /testorigin.mdx", @@ -58,7 +58,7 @@ describe("Validate links", () => { let testLink = new InternalLink("../testpath", [ "docs/test/testorigin.mdx", ]); - let testFile = new File("docs/testpath.mdx", []); + let testFile = new File("docs/testpath.mdx", new Set()); const results = testLink.check([testFile]); expect(results).toBeUndefined(); }); @@ -67,7 +67,7 @@ describe("Validate links", () => { let testLink = new InternalLink("../testpath", [ "docs/test1/test2/testorigin.mdx", ]); - let testFile = new File("docs/testpath.mdx", []); + let testFile = new File("docs/testpath.mdx", new Set()); const results = testLink.check([testFile]); expect(results).toEqual( "❌ Could not find link '../testpath'. Appears in:\n docs/test1/test2/testorigin.mdx", @@ -81,8 +81,8 @@ describe("Validate links", () => { "docs/test/test3/testorigin.mdx", "docs/test/test2/test4/testorigin.mdx", ]); - let testFile1 = new File("docs/testpath.mdx", []); - let testFile2 = new File("docs/test/test2/testpath.mdx", []); + let testFile1 = new File("docs/testpath.mdx", new Set()); + let testFile2 = new File("docs/test/test2/testpath.mdx", new Set()); const results = testLink.check([testFile1, testFile2]); expect(results).toBeUndefined(); }); @@ -94,8 +94,8 @@ describe("Validate links", () => { "docs/test/test3/testorigin.mdx", "docs/test/test2/test4/testorigin.mdx", ]); - let testFile1 = new File("docs/test/testpath.mdx", []); - let testFile2 = new File("docs/test2/test3/testpath.mdx", []); + let testFile1 = new File("docs/test/testpath.mdx", new Set()); + let testFile2 = new File("docs/test2/test3/testpath.mdx", new Set()); const results = testLink.check([testFile1, testFile2]); expect(results).toEqual( "❌ Could not find link '/testpath'. Appears in:\n" + @@ -115,8 +115,8 @@ describe("Validate links", () => { "docs/test/test3/testorigin.mdx", "docs/test/test2/test4/testorigin.mdx", ]); - let testFile1 = new File("docs/testpath.mdx", []); - let testFile2 = new File("docs/test/test2/testpath.mdx", []); + let testFile1 = new File("docs/testpath.mdx", new Set()); + let testFile2 = new File("docs/test/test2/testpath.mdx", new Set()); const results = testLink.check([testFile1, testFile2]); expect(results).toEqual( "❌ Could not find link '../testpath'. Appears in:\n docs/test/test2/testorigin.mdx\n docs/test/test3/testorigin.mdx", @@ -127,7 +127,7 @@ describe("Validate links", () => { let testLink = new InternalLink("/testpath#test_anchor", [ "/testorigin.mdx", ]); - let testFile = new File("docs/testpath.mdx", ["#test_anchor"]); + let testFile = new File("docs/testpath.mdx", new Set(["#test_anchor"])); const results = testLink.check([testFile]); expect(results).toBeUndefined(); }); @@ -136,7 +136,10 @@ describe("Validate links", () => { let testLink = new InternalLink("/testpath#test_anchor", [ "/testorigin.mdx", ]); - let testFile = new File("docs/testpath.mdx", ["#test_diff_anchor"]); + let testFile = new File( + "docs/testpath.mdx", + new Set(["#test_diff_anchor"]), + ); const results = testLink.check([testFile]); expect(results).toEqual( "❌ Could not find link '/testpath#test_anchor'. Appears in:\n /testorigin.mdx ❓ Did you mean '/testpath#test_diff_anchor'?", @@ -147,7 +150,7 @@ describe("Validate links", () => { let testLink = new InternalLink("../testpath#test_anchor", [ "docs/test/testorigin.mdx", ]); - let testFile = new File("docs/testpath.mdx", ["#test_anchor"]); + let testFile = new File("docs/testpath.mdx", new Set(["#test_anchor"])); const results = testLink.check([testFile]); expect(results).toBeUndefined(); }); @@ -156,7 +159,10 @@ describe("Validate links", () => { let testLink = new InternalLink("../testpath#test-anchor", [ "docs/test/testorigin.mdx", ]); - let testFile = new File("docs/testpath.mdx", ["#test_diff_anchor"]); + let testFile = new File( + "docs/testpath.mdx", + new Set(["#test_diff_anchor"]), + ); const results = testLink.check([testFile]); expect(results).toEqual( "❌ Could not find link '../testpath#test-anchor'. Appears in:\n docs/test/testorigin.mdx ❓ Did you mean '/testpath#test_diff_anchor'?", diff --git a/scripts/lib/links/InternalLink.ts b/scripts/lib/links/InternalLink.ts index f40f5038d0f..7ec7963f879 100644 --- a/scripts/lib/links/InternalLink.ts +++ b/scripts/lib/links/InternalLink.ts @@ -18,14 +18,14 @@ const CONTENT_FILE_EXTENSIONS = [".md", ".mdx", ".ipynb"]; export class File { readonly path: string; - readonly anchors: string[]; + readonly anchors: Set; readonly synthetic: boolean; /** * path: Path to the file * anchors: Anchors available in the file */ - constructor(path: string, anchors: string[], synthetic: boolean = false) { + constructor(path: string, anchors: Set, synthetic: boolean = false) { this.path = path; this.anchors = anchors; this.synthetic = synthetic; @@ -96,7 +96,7 @@ export class InternalLink { existingFile.path == filePath && (this.anchor == "" || existingFile.synthetic == true || - existingFile.anchors.includes(this.anchor)), + existingFile.anchors.has(this.anchor)), ), ); } @@ -122,7 +122,7 @@ export class InternalLink { if (score < minScoreLink) { minScoreLink = score; suggestionPath = file.path; - suggestionPathAnchors = file.anchors; + suggestionPathAnchors = Array.from(file.anchors); } }); diff --git a/scripts/lib/links/extractLinks.test.ts b/scripts/lib/links/extractLinks.test.ts index 752305df4fd..8c081ab5c6f 100644 --- a/scripts/lib/links/extractLinks.test.ts +++ b/scripts/lib/links/extractLinks.test.ts @@ -63,13 +63,15 @@ test("parseAnchors()", () => { ## \`code-header\` `); - expect(result).toEqual([ - "#my-top-level-heading", - "#header-2", - "#code-header", - "#this-is-a-hardcoded-anchor", - "#another_span", - ]); + expect(result).toEqual( + new Set([ + "#my-top-level-heading", + "#header-2", + "#code-header", + "#this-is-a-hardcoded-anchor", + "#another_span", + ]), + ); }); test("parseLinks()", async () => { @@ -81,11 +83,9 @@ test("parseLinks()", async () => { Explicit anchor `; - const result = await parseLinks(markdown); - expect(result).toEqual([ - "https://ibm.com", - "./relative", - "/images/my_image.png", - "./explicit-anchor", - ]); + const [internalLinks, externalLinks] = await parseLinks(markdown); + expect(internalLinks).toEqual( + new Set(["./relative", "/images/my_image.png", "./explicit-anchor"]), + ); + expect(externalLinks).toEqual(new Set(["https://ibm.com"])); }); diff --git a/scripts/lib/links/extractLinks.ts b/scripts/lib/links/extractLinks.ts index 39ab0b2a901..b9984fb4bc5 100644 --- a/scripts/lib/links/extractLinks.ts +++ b/scripts/lib/links/extractLinks.ts @@ -27,9 +27,11 @@ import { getRoot } from "../fs"; export type ParsedFile = { /** Anchors that the file defines. These can be linked to from other files. */ - anchors: string[]; - /** Links that this file has to other places. These need to be validated. */ - rawLinks: string[]; + anchors: Set; + /** Internal links that this file has to other places. */ + internalLinks: Set; + /** External links that this file has to other places. */ + externalLinks: Set; }; interface JupyterCell { @@ -45,16 +47,28 @@ export function markdownFromNotebook(rawContent: string): string { .join("\n"); } -export function parseAnchors(markdown: string): string[] { +export function parseAnchors(markdown: string): Set { // Anchors generated from markdown titles. const mdAnchors = markdownLinkExtractor(markdown).anchors; // Anchors from HTML id tags. const idAnchors = markdown.match(/(?<=id=")(.*)(?=")/gm) || []; - return [...mdAnchors, ...idAnchors.map((id) => `#${id}`)]; + return new Set([...mdAnchors, ...idAnchors.map((id) => `#${id}`)]); } -export async function parseLinks(markdown: string): Promise { - const result: string[] = []; +export async function parseLinks( + markdown: string, +): Promise<[Set, Set]> { + const internalLinks = new Set(); + const externalLinks = new Set(); + + const addLink = (link: string): void => { + if (link.startsWith("http")) { + externalLinks.add(link); + } else { + internalLinks.add(link); + } + }; + await unified() .use(rehypeParse) .use(remarkGfm) @@ -62,19 +76,19 @@ export async function parseLinks(markdown: string): Promise { .use(() => (tree: Root) => { visit(tree, "text", (TreeNode) => { markdownLinkExtractor(String(TreeNode.value)).links.forEach((url) => - result.push(url), + addLink(url), ); }); - visit(tree, "link", (TreeNode) => result.push(TreeNode.url)); - visit(tree, "image", (TreeNode) => result.push(TreeNode.url)); + visit(tree, "link", (TreeNode) => addLink(TreeNode.url)); + visit(tree, "image", (TreeNode) => addLink(TreeNode.url)); }) .use(remarkStringify) .process(markdown); - return result; + return [internalLinks, externalLinks]; } -async function parseObjectsInv(filePath: string): Promise { +async function parseObjectsInv(filePath: string): Promise> { const absoluteFilePath = path.join( getRoot(), removeSuffix(filePath, "objects.inv"), @@ -82,17 +96,22 @@ async function parseObjectsInv(filePath: string): Promise { const objinv = await ObjectsInv.fromFile(absoluteFilePath); // All URIs are relative to the objects.inv file const dirname = removePrefix(path.dirname(filePath), "public"); - const links = objinv.entries.map((entry) => path.join(dirname, entry.uri)); - return { rawLinks: links, anchors: [] }; + return new Set(objinv.entries.map((entry) => path.join(dirname, entry.uri))); } export async function parseFile(filePath: string): Promise { if (filePath.endsWith(".inv")) { - return await parseObjectsInv(filePath); + const links = await parseObjectsInv(filePath); + return { + internalLinks: links, + externalLinks: new Set(), + anchors: new Set(), + }; } + const source = await readFile(filePath, { encoding: "utf8" }); const markdown = path.extname(filePath) === ".ipynb" ? markdownFromNotebook(source) : source; - const links = await parseLinks(markdown); - return { anchors: parseAnchors(markdown), rawLinks: links }; + const [internalLinks, externalLinks] = await parseLinks(markdown); + return { anchors: parseAnchors(markdown), internalLinks, externalLinks }; }