From b91049d82213c7e1d7859c860342e15f6d6b0888 Mon Sep 17 00:00:00 2001 From: Eric Arellano <14852634+Eric-Arellano@users.noreply.github.com> Date: Wed, 28 Feb 2024 14:57:29 -0500 Subject: [PATCH] Split internal and external link checker into separate programs (#916) Closes https://github.com/Qiskit/documentation/issues/876. This split brings several improvements: 1. Simpler internal link checker code. 2. The external link checker now deduplicates links across all files, which avoids repeating requests. 3. The external link checker logs its progress every 20 links. 4. It's possible to check external links without checking internal links. We run the external link checker in our weekly cron job over all files, other than translations and historical Qiskit versions. That change was added in https://github.com/Qiskit/documentation/pull/913. --- .github/workflows/api-checks.yml | 4 +- .github/workflows/main.yml | 4 +- .github/workflows/weekly-checks.yml | 12 +-- README.md | 32 +++++--- package.json | 7 +- scripts/commands/checkExternalLinks.ts | 78 +++++++++++++++++++ .../{checkLinks.ts => checkInternalLinks.ts} | 9 +-- scripts/lib/links/FileBatch.ts | 58 ++++---------- 8 files changed, 127 insertions(+), 77 deletions(-) create mode 100644 scripts/commands/checkExternalLinks.ts rename scripts/commands/{checkLinks.ts => checkInternalLinks.ts} (96%) diff --git a/.github/workflows/api-checks.yml b/.github/workflows/api-checks.yml index 39f7de03dbd..263bdb75677 100644 --- a/.github/workflows/api-checks.yml +++ b/.github/workflows/api-checks.yml @@ -19,7 +19,7 @@ on: - "docs/api/qiskit-ibm-provider/**/*" - "docs/api/qiskit-ibm-runtime/**/*" - "public/api/**/*" - - "scripts/checkLinks.ts" + - "scripts/checkInternalLinks.ts" - "scripts/lib/links/ignores.ts" jobs: @@ -36,7 +36,7 @@ jobs: run: npm ci - name: Check internal links run: > - npm run check:links -- + npm run check:internal-links -- --qiskit-release-notes --current-apis --dev-apis diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index ede2b038e6d..c9d773120df 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -31,7 +31,7 @@ jobs: - name: Spellcheck run: npm run check:spelling - name: Internal link checker - run: npm run check:links + run: npm run check:internal-links - name: Formatting run: npm run check:fmt - name: Typecheck @@ -54,4 +54,4 @@ jobs: if: steps.changed-docs-files.outputs.any_changed == 'true' run: | echo "${{ steps.changed-docs-files.outputs.all_changed_files }}" > changed.txt - npm run check-pages-render -- --from-file changed.txt + npm run check:pages-render -- --from-file changed.txt diff --git a/.github/workflows/weekly-checks.yml b/.github/workflows/weekly-checks.yml index 52fad89e6c9..367fa3f538f 100644 --- a/.github/workflows/weekly-checks.yml +++ b/.github/workflows/weekly-checks.yml @@ -34,7 +34,7 @@ jobs: sleep 20 - name: Check all pages render run: > - npm run check-pages-render -- + npm run check:pages-render -- --non-api --qiskit-release-notes --current-apis @@ -54,10 +54,6 @@ jobs: run: npm ci - name: Check external links run: > - npm run check:links -- - --qiskit-release-notes - --current-apis - --dev-apis - --historical-apis - --skip-broken-historical - --external + npm run check:external-links -- + 'docs/**/*.{md,mdx,ipynb}' + '!docs/api/qiskit/[0-9]*/*' diff --git a/README.md b/README.md index 07963eea9a7..018b3d6ad44 100644 --- a/README.md +++ b/README.md @@ -178,27 +178,35 @@ about it. ## Check for broken links -CI will check for broken links. You can also check locally: +We have two broken link checkers: for internal links and for external links. -```bash -# Only check for internal broken links -npm run check:links +To check internal links: -# Enable the validation of external links -npm run check:links -- --external +```bash +# Only check non-API docs +npm run check:internal-links -# By default, only the non-API docs are checked. You can add any of the -# below arguments to also check API docs. -npm run check:links -- --current-apis --dev-apis --historical-apis --qiskit-release-notes +# You can add any of the below arguments to also check API docs. +npm run check:internal-links -- --current-apis --dev-apis --historical-apis --qiskit-release-notes # However, `--historical-apis` currently has failing versions, so you may # want to add `--skip-broken-historical`. -npm run check:links -- --historical-apis --skip-broken-historical +npm run check:internal-links -- --historical-apis --skip-broken-historical # Or, run all the checks. Although this only checks non-API docs. npm run check ``` +To check external links: + +```bash +# Specify the files you want after `--` +npm run check:external-links -- docs/run/index.md docs/run/circuit-execution.mdx + +# You can also use globs +npm run check:external-links -- 'docs/run/*' '!docs/run/index.mdx' +``` + ## Check file metadata Every file needs to have a `title` and `description`. The `lint` job in CI will fail with instructions for any bad file. @@ -260,9 +268,9 @@ It's possible to write broken pages that crash when loaded. This is usually due To check that all the non-API docs render: 1. Start up the local preview with `./start` by following the instructions at [Preview the docs locally](#preview-the-docs-locally) -2. In a new tab, `npm run check-pages-render -- --non-api` +2. In a new tab, `npm run check:pages-render -- --non-api` -You can also check that API docs and translations render by using any of these arguments: `npm run check-pages-render -- --non-api --qiskit-release-notes --current-apis --dev-apis --historical-apis --translations`. Warning that this is exponentially slower. +You can also check that API docs and translations render by using any of these arguments: `npm run check:pages-render -- --non-api --qiskit-release-notes --current-apis --dev-apis --historical-apis --translations`. Warning that this is exponentially slower. CI will check on every PR that any changed files render correctly. We also run a weekly cron job to check that every page renders correctly. diff --git a/package.json b/package.json index ad66f94f4cd..63836c62883 100644 --- a/package.json +++ b/package.json @@ -5,12 +5,13 @@ "author": "Qiskit Development Team", "license": "Apache-2.0", "scripts": { - "check": "npm run check:metadata && npm run check:spelling && npm run check:links && npm run check:fmt", + "check": "npm run check:metadata && npm run check:spelling && npm run check:internal-links && npm run check:fmt", "check:metadata": "node -r esbuild-register scripts/commands/checkMetadata.ts", - "check:links": "node -r esbuild-register scripts/commands/checkLinks.ts", "check:spelling": "cspell --relative --no-progress docs/**/*.md* docs/api/**/*.md* --config cspell/cSpell.json", "check:fmt": "prettier --check .", - "check-pages-render": "node -r esbuild-register scripts/commands/checkPagesRender.ts", + "check:internal-links": "node -r esbuild-register scripts/commands/checkInternalLinks.ts", + "check:external-links": "node -r esbuild-register scripts/commands/checkExternalLinks.ts", + "check:pages-render": "node -r esbuild-register scripts/commands/checkPagesRender.ts", "fmt": "prettier --write .", "test": "jest", "typecheck": "tsc", diff --git a/scripts/commands/checkExternalLinks.ts b/scripts/commands/checkExternalLinks.ts new file mode 100644 index 00000000000..24aeae4b3a4 --- /dev/null +++ b/scripts/commands/checkExternalLinks.ts @@ -0,0 +1,78 @@ +// This code is a Qiskit project. +// +// (C) Copyright IBM 2024. +// +// This code is licensed under the Apache License, Version 2.0. You may +// obtain a copy of this license in the LICENSE file in the root directory +// of this source tree or at http://www.apache.org/licenses/LICENSE-2.0. +// +// Any modifications or derivative works of this code must retain this +// copyright notice, and modified files need to carry a notice indicating +// that they have been altered from the originals. + +import { globby } from "globby"; +import yargs from "yargs/yargs"; +import { hideBin } from "yargs/helpers"; + +import { ExternalLink } from "../lib/links/ExternalLink"; +import { parseFile } from "../lib/links/extractLinks"; +import { addLinksToMap } from "../lib/links/FileBatch"; + +interface Arguments { + [x: string]: unknown; + globs?: string[]; +} + +const readArgs = (): Arguments => { + return yargs(hideBin(process.argv)) + .version(false) + .command("$0 [globs..]", "") + .parseSync(); +}; + +async function main() { + const args = readArgs(); + const filePaths = await globby(args.globs || []); + + const links = await getLinks(filePaths); + + // We use a for loop to reduce the risk of rate limiting. + let allGood = true; + let numLinksChecked = 1; + for (const link of links) { + const result = await link.check(); + + // This script can be slow, so log progress every 20 links. + if (numLinksChecked % 20 == 0) { + console.log(`Checked ${numLinksChecked} / ${links.length} links`); + } + numLinksChecked++; + + if (result === undefined) continue; + console.error(result); + allGood = false; + } + + if (!allGood) { + console.error( + `\n\nšŸ’” Some external links appear broken, although it's possible they are flakes. Checked ${links.length} links.`, + ); + process.exit(1); + } + console.log( + `\n\nāœ… No external links are broken. Checked ${links.length} links.`, + ); +} + +async function getLinks(filePaths: string[]): Promise { + const linksToOriginFiles = new Map(); + for (const filePath of filePaths) { + const parsed = await parseFile(filePath); + addLinksToMap(filePath, parsed.externalLinks, linksToOriginFiles); + } + return Array.from(linksToOriginFiles.entries()).map( + ([link, originFiles]) => new ExternalLink(link, originFiles), + ); +} + +main().then(() => process.exit()); diff --git a/scripts/commands/checkLinks.ts b/scripts/commands/checkInternalLinks.ts similarity index 96% rename from scripts/commands/checkLinks.ts rename to scripts/commands/checkInternalLinks.ts index 649dfcddbaf..6e49659edbe 100644 --- a/scripts/commands/checkLinks.ts +++ b/scripts/commands/checkInternalLinks.ts @@ -29,7 +29,6 @@ const SYNTHETIC_FILES: string[] = [ interface Arguments { [x: string]: unknown; - external: boolean; currentApis: boolean; devApis: boolean; historicalApis: boolean; @@ -40,12 +39,6 @@ interface Arguments { const readArgs = (): Arguments => { return yargs(hideBin(process.argv)) .version(false) - .option("external", { - type: "boolean", - default: false, - description: - "Should external links be checked? This slows down the script, but is useful to check.", - }) .option("current-apis", { type: "boolean", default: false, @@ -91,7 +84,7 @@ async function main() { let allGood = true; for (const fileBatch of fileBatches) { - const allValidLinks = await fileBatch.check(args.external, otherFiles); + const allValidLinks = await fileBatch.checkInternalLinks(otherFiles); if (!allValidLinks) { allGood = false; } diff --git a/scripts/lib/links/FileBatch.ts b/scripts/lib/links/FileBatch.ts index 557d522d512..969d5e601bd 100644 --- a/scripts/lib/links/FileBatch.ts +++ b/scripts/lib/links/FileBatch.ts @@ -13,7 +13,6 @@ import { globby } from "globby"; import { InternalLink, File } from "./InternalLink"; -import { ExternalLink } from "./ExternalLink"; import { ALWAYS_IGNORED_URLS, FILES_TO_IGNORES, @@ -59,75 +58,50 @@ export class FileBatch { /** * Load and process the file batch. * - * Returns a triplet: + * Returns a pair: * 1. A list of `File` objects with their anchors. These represent * the universe of valid internal links for this batch, other * than any additional we may add at check-time, e.g. images. * 2. A list of InternalLink objects to validate. - * 3. A list of ExternalLink objects to validate. */ - async load( - loadExternalLinks: boolean, - ): Promise<[File[], InternalLink[], ExternalLink[]]> { + async load(): Promise<[File[], InternalLink[]]> { const files: File[] = []; for (let filePath of this.toLoad) { const parsed = await parseFile(filePath); files.push(new File(filePath, parsed.anchors)); } - const internalLinksToOriginFiles = new Map(); - const externalLinksToOriginFiles = new Map(); + const linksToOriginFiles = new Map(); for (const filePath of this.toCheck) { const parsed = await parseFile(filePath); files.push(new File(filePath, parsed.anchors)); - addLinksToMap(filePath, parsed.internalLinks, internalLinksToOriginFiles); - if (loadExternalLinks) { - addLinksToMap( - filePath, - parsed.externalLinks, - externalLinksToOriginFiles, - ); - } + addLinksToMap(filePath, parsed.internalLinks, linksToOriginFiles); } - const internalLinks = Array.from(internalLinksToOriginFiles.entries()).map( + const links = Array.from(linksToOriginFiles.entries()).map( ([link, originFiles]) => new InternalLink(link, originFiles), ); - const externalLinks = Array.from(externalLinksToOriginFiles.entries()).map( - ([link, originFiles]) => new ExternalLink(link, originFiles), - ); - return [files, internalLinks, externalLinks]; + return [files, links]; } /** - * Check that all links in this file batch are valid. + * Check that all internal links in this file batch are valid. * * Logs the results to the console and returns `true` if there were no issues. */ - async check( - checkExternalLinks: boolean, - otherFiles: File[], - ): Promise { - console.log(`\n\nChecking links for ${this.description}`); + async checkInternalLinks(otherFiles: File[]): Promise { + console.log(`\n\nChecking internal links for ${this.description}`); - const [docsFiles, internalLinkList, externalLinkList] = - await this.load(checkExternalLinks); + const [docsFiles, links] = await this.load(); const existingFiles = docsFiles.concat(otherFiles); - const results = internalLinkList.map((link) => link.check(existingFiles)); - - // For loop reduces the risk of rate-limiting. - for (let link of externalLinkList) { - results.push(await link.check()); - } - let allGood = true; - results - .filter((res) => res !== undefined) - .forEach((linkError) => { - console.error(linkError); - allGood = false; - }); + links.forEach((link) => { + const result = link.check(existingFiles); + if (result === undefined) return; + console.error(result); + allGood = false; + }); return allGood; } }