Skip to content

Commit

Permalink
docs: fix broken links and adjust linkinator config DOC-1382 (#3928)
Browse files Browse the repository at this point in the history
  • Loading branch information
addetz committed Sep 17, 2024
1 parent e7e4ef6 commit 91bf6ed
Show file tree
Hide file tree
Showing 8 changed files with 135 additions and 70 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/url-checks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ jobs:
- name: URL Checker
run: make verify-url-links-ci

- name: URL Security Bulletins Checker
run: make verify-security-bulletins-links-ci
- name: URL Rate Limit Checker
run: make verify-rate-limited-links-ci

- name: Post Comment
run: |
Expand Down
88 changes: 26 additions & 62 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,14 @@ CPUS := $(shell sysctl -n hw.ncpu | awk '{print int($$1 / 2)}')
ALOGLIA_CONFIG=$(shell cat docsearch.dev.config.json | jq -r tostring)

# Find all *.md files in docs, cut the prefix ./
# Remove all security-bulletins and cve-reports.md
VERIFY_URL_PATHS=$(shell find ./docs -name "*.md" | cut -c 3- | sed '/security-bulletins/d' | sed '/cve-reports/d' )
# Remove all security-bulletins and cve-reports.md because they are rate limited by nvd.nist.gov
# Remove oss-licenses.md because it is rate limited by npmjs.com
VERIFY_URL_PATHS=$(shell find ./docs -name "*.md" | cut -c 3- | sed '/security-bulletins/d' | sed '/cve-reports/d' | sed '/oss-licenses/d')

RATE_LIMITED_FILES_LIST:="docs/docs-content/security-bulletins/**/*.md" \
"docs/docs-content/security-bulletins/*.md" \
"docs/docs-content/unlisted/cve-reports.md" \
"docs/docs-content/legal-licenses/oss-licenses.md"

help: ## Display this help
@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z_-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[0m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
Expand Down Expand Up @@ -175,78 +181,36 @@ pdf-local: ## Generate PDF from local docs
verify-url-links:
@echo "Checking for broken external URLs in markdown files..."
rm link_report.csv || echo "No report exists. Proceeding to scan step"
@npx linkinator $(VERIFY_URL_PATHS) --concurrency 50 --markdown --recurse --timeout 100000 --retry --retry-errors-jitter --retry-errors-count 5 \
--skip "^https:\/\/docs\.spectrocloud\.com.*$$" \
--skip "^https:\/\/docs\.spectrocloud\.com\/.*\/supplemental\-packs$$" \
--skip "^http:\/\/docs\.spectrocloud\.com.*$$" \
--skip "^https:\/\/software-private\.spectrocloud\.com.*$$" \
--skip "^\/.*\.md$$" \
--skip "!\[.*\]\(.*\)$$" \
--skip "\.(jpg|jpeg|png|gif|webp)$$" \
--skip "https:\/\/linux\.die\.net\/man\/.*$$" \
--skip "https:\/\/mysql\.com\/.*\.*$$" \
--skip "https:\/\/dev\.mysql\.com\/doc\/.*$$" \
--format csv > temp_report.csv && sleep 2
@npx linkinator $(VERIFY_URL_PATHS) --config ./linkinator/linkinator.config.json > temp_report.csv && sleep 2
@grep -E 'https?://' temp_report.csv > filtered_report.csv
@grep -E ',[[:space:]]*([4-9][0-9]{2}|[0-9]{4,}),' filtered_report.csv > link_report.csv && rm temp_report.csv filtered_report.csv

verify-security-bulletins-links:
@echo "Checking for broken URLs in security-bulletins markdown files..."
rm link_sec_bul_report.csv || echo "No security bulletins report exists. Proceeding to scan step"
@npx linkinator "docs/docs-content/security-bulletins/**/*.md" "docs/docs-content/security-bulletins/*.md" "docs/docs-content/unlisted/cve-reports.md" --concurrency 1 --markdown --recurse --timeout 100000 --retry --retry-errors-jitter --retry-errors-count 5 \
--skip "^https:\/\/docs\.spectrocloud\.com.*$$" \
--skip "^https:\/\/docs\.spectrocloud\.com\/.*\/supplemental\-packs$$" \
--skip "^http:\/\/docs\.spectrocloud\.com.*$$" \
--skip "^https:\/\/software-private\.spectrocloud\.com.*$$" \
--skip "^\/.*\.md$$" \
--skip "!\[.*\]\(.*\)$$" \
--skip "\.(jpg|jpeg|png|gif|webp)$$" \
--skip "https:\/\/linux\.die\.net\/man\/.*$$" \
--skip "https:\/\/mysql\.com\/.*\.*$$" \
--skip "https:\/\/dev\.mysql\.com\/doc\/.*$$" \
--format csv > temp_sec_bul_report.csv && sleep 2
@grep -E 'https?://' temp_sec_bul_report.csv > filtered_sec_bul_report.csv
@grep -E ',[[:space:]]*([4-9][0-9]{2}|[0-9]{4,}),' filtered_sec_bul_report.csv > link_sec_bul_report.csv && rm temp_sec_bul_report.csv filtered_sec_bul_report.csv
# Scan the files listed in RATE_LIMITED_FILES_LIST with linkinator, using the
# settings in ./linkinator/linkinator-rate-limit.config.json, and write the
# failing rows to link_rate_limit_report.csv.
#
# The report keeps only rows that contain an http(s) URL and a numeric field of
# 400-999 (or four or more digits) -- presumably the HTTP status column of the
# linkinator CSV output; confirm against the linkinator report format.
verify-rate-limited-links:
	@echo "Checking for broken URLs in security-bulletins and oss-licenses markdown files..."
	@rm link_rate_limit_report.csv || echo "No rate limited report exists. Proceeding to scan step"
	@echo "Checking the following paths: $(RATE_LIMITED_FILES_LIST)"
	@npx linkinator $(RATE_LIMITED_FILES_LIST) --config ./linkinator/linkinator-rate-limit.config.json > temp_rate_limit_report.csv && sleep 2
	@# grep exits 1 when no line matches. That is the "nothing to report" case, so
	@# accept status 1 and only fail the recipe on a real grep error (status >= 2).
	@grep -E 'https?://' temp_rate_limit_report.csv > filtered_rate_limit_report.csv || [ $$? -eq 1 ]
	@grep -E ',[[:space:]]*([4-9][0-9]{2}|[0-9]{4,}),' filtered_rate_limit_report.csv > link_rate_limit_report.csv || [ $$? -eq 1 ]
	@# Always remove the intermediate files, even when the final report is empty.
	@rm -f temp_rate_limit_report.csv filtered_rate_limit_report.csv

verify-url-links-ci: ## Check for broken URLs in production in a GitHub Actions CI environment
@echo "Checking for broken external URLs in CI environment..."
rm link_report.json || echo "No report exists. Proceeding to scan step"
@npx linkinator $(VERIFY_URL_PATHS) --concurrency 50 --markdown --recurse --timeout 100000 --retry --retry-errors-jitter --retry-errors-count 5 \
--skip "^https:\/\/docs\.spectrocloud\.com.*$$" \
--skip "^https:\/\/docs\.spectrocloud\.com\/.*\/supplemental\-packs$$" \
--skip "^http:\/\/docs\.spectrocloud\.com.*$$" \
--skip "^https:\/\/software-private\.spectrocloud\.com.*$$" \
--skip "^\/.*\.md$$" \
--skip "!\[.*\]\(.*\)$$" \
--skip "\.(jpg|jpeg|png|gif|webp)$$" \
--skip "https:\/\/linux\.die\.net\/man\/.*$$" \
--skip "https:\/\/mysql\.com\/.*\.*$$" \
--skip "https:\/\/dev\.mysql\.com\/doc\/.*$$" \
--format json > temp_report.json
@rm link_report.json || echo "No report exists. Proceeding to scan step"
@npx linkinator $(VERIFY_URL_PATHS) --config ./linkinator/linkinator-ci.config.json > temp_report.json
@# Use jq to filter out links that do not start with http or https and keep only broken links
@jq '[.links[] | select(.url | test("^https?://")) | select(.status >= 400)]' temp_report.json > filtered_report.json
@rm temp_report.json
@mv filtered_report.json scripts/link_report.json

verify-security-bulletins-links-ci: ## Check for broken URLs in production in a GitHub Actions CI environment
@echo "Checking for broken URLs in security-bulletins markdown files in CI environment..."
rm link_sec_bul_report.json || echo "No security bulletins report exists. Proceeding to scan step"
@npx linkinator "docs/docs-content/security-bulletins/**/*.md" "docs/docs-content/security-bulletins/*.md" "docs/docs-content/unlisted/cve-reports.md" --concurrency 1 --markdown --recurse --timeout 100000 --retry --retry-errors-jitter --retry-errors-count 5 \
--skip "^https:\/\/docs\.spectrocloud\.com.*$$" \
--skip "^https:\/\/docs\.spectrocloud\.com\/.*\/supplemental\-packs$$" \
--skip "^http:\/\/docs\.spectrocloud\.com.*$$" \
--skip "^https:\/\/software-private\.spectrocloud\.com.*$$" \
--skip "^\/.*\.md$$" \
--skip "!\[.*\]\(.*\)$$" \
--skip "\.(jpg|jpeg|png|gif|webp)$$" \
--skip "https:\/\/linux\.die\.net\/man\/.*$$" \
--skip "https:\/\/mysql\.com\/.*\.*$$" \
--skip "https:\/\/dev\.mysql\.com\/doc\/.*$$" \
--format json > temp_sec_bul_report.json
verify-rate-limited-links-ci: ## Check for broken URLs in production in a GitHub Actions CI environment
@echo "Checking for broken URLs in security-bulletins and oss-licenses markdown files in CI environment..."
@rm link_rate_limit_report.json || echo "No rate limited report exists. Proceeding to scan step"
@echo "Checking the following paths: $(RATE_LIMITED_FILES_LIST)"
@npx linkinator $(RATE_LIMITED_FILES_LIST) --config ./linkinator/linkinator-rate-limit-ci.config.json > temp_rate_limit_report.json
@# Use jq to filter out links that do not start with http or https and keep only broken links
@jq '[.links[] | select(.url | test("^https?://")) | select(.status >= 400)]' temp_sec_bul_report.json > filtered_sec_bul_report.json
@rm temp_sec_bul_report.json
@mv filtered_sec_bul_report.json scripts/link_sec_bul_report.json
@jq '[.links[] | select(.url | test("^https?://")) | select(.status >= 400)]' temp_rate_limit_report.json > filtered_rate_limit_report.json
@rm temp_rate_limit_report.json
@mv filtered_rate_limit_report.json scripts/link_rate_limit_report.json

###@ Image Formatting

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ The following software must be installed on your system:
- [tar](https://www.gnu.org/software/tar/) - This is only required if you need to deploy the offline documentation to a
device without internet access.

- [cosign](https://docs.sigstore.dev/system_config/installation) - Not required unless you want to verify the
- [cosign](https://docs.sigstore.dev/cosign/system_config/installation/) - Not required unless you want to verify the
authenticity of the container image. Review the [Container Image Authenticity](#container-image-authenticity) section
for more information.

Expand Down Expand Up @@ -113,8 +113,8 @@ image is signed using a cryptographic key pair that is private and stored intern
documentation repository at
[**static/cosign.pub**](https://raw.githubusercontent.com/spectrocloud/librarium/master/static/cosign.pub). Use the
public key to verify the authenticity of the container image. You can learn more about the container image signing
process by reviewing the [Signing Containers](https://docs.sigstore.dev/signing/signing_with_containers) documentation
page.
process by reviewing the [Signing Containers](https://docs.sigstore.dev/cosign/signing/signing_with_containers/)
documentation page.

:::info

Expand Down
25 changes: 25 additions & 0 deletions linkinator/linkinator-ci.config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"concurrency": 50,
"markdown": true,
"recurse": true,
"timeout": 100000,
"retry": true,
"retryErrors": true,
"retryErrorsJitter": 10000,
"retryErrorsCount": 5,
"format": "json",
"skip": [
"^https://docs\\.spectrocloud\\.com.*$",
"^https://docs\\.spectrocloud\\.com/.*/supplemental-packs$",
"^http://docs\\.spectrocloud\\.com.*$",
"^https://software-private\\.spectrocloud\\.com.*$",
"^/.*\\.md$",
"!\\[.*\\]\\(.*\\)$",
"\\.(jpg|jpeg|png|gif|webp)$",
"https://linux\\.die\\.net/man/.*$",
"https://mysql\\.com/.*\\.*$",
"https://dev\\.mysql\\.com/doc/.*$"
],
"verbosity": "error",
"userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36"
}
25 changes: 25 additions & 0 deletions linkinator/linkinator-rate-limit-ci.config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"concurrency": 1,
"markdown": true,
"recurse": false,
"timeout": 100000,
"retry": true,
"retryErrors": true,
"retryErrorsJitter": 10000,
"retryErrorsCount": 5,
"format": "json",
"skip": [
"^https://docs\\.spectrocloud\\.com.*$",
"^https://docs\\.spectrocloud\\.com/.*/supplemental-packs$",
"^http://docs\\.spectrocloud\\.com.*$",
"^https://software-private\\.spectrocloud\\.com.*$",
"^/.*\\.md$",
"!\\[.*\\]\\(.*\\)$",
"\\.(jpg|jpeg|png|gif|webp)$",
"https://linux\\.die\\.net/man/.*$",
"https://mysql\\.com/.*\\.*$",
"https://dev\\.mysql\\.com/doc/.*$"
],
"verbosity": "error",
"userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36"
}
26 changes: 26 additions & 0 deletions linkinator/linkinator-rate-limit.config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"concurrency": 1,
"markdown": true,
"recurse": false,
"timeout": 100000,
"retry": true,
"retryErrors": true,
"retryErrorsJitter": 10000,
"retryErrorsCount": 5,
"format": "csv",
"skip": [
"^https://docs\\.spectrocloud\\.com.*$",
"^https://docs\\.spectrocloud\\.com/.*/supplemental-packs$",
"^http://docs\\.spectrocloud\\.com.*$",
"^https://software-private\\.spectrocloud\\.com.*$",
"^/.*\\.md$",
"^/.*\\.md#*$",
"!\\[.*\\]\\(.*\\)$",
"\\.(jpg|jpeg|png|gif|webp)$",
"https://linux\\.die\\.net/man/.*$",
"https://mysql\\.com/.*\\.*$",
"https://dev\\.mysql\\.com/doc/.*$"
],
"verbosity": "error",
"userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36"
}
25 changes: 25 additions & 0 deletions linkinator/linkinator.config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"concurrency": 50,
"markdown": true,
"recurse": true,
"timeout": 100000,
"retry": true,
"retryErrors": true,
"retryErrorsJitter": 10000,
"retryErrorsCount": 5,
"format": "csv",
"skip": [
"^https://docs\\.spectrocloud\\.com.*$",
"^https://docs\\.spectrocloud\\.com/.*/supplemental-packs$",
"^http://docs\\.spectrocloud\\.com.*$",
"^https://software-private\\.spectrocloud\\.com.*$",
"^/.*\\.md$",
"!\\[.*\\]\\(.*\\)$",
"\\.(jpg|jpeg|png|gif|webp)$",
"https://linux\\.die\\.net/man/.*$",
"https://mysql\\.com/.*\\.*$",
"https://dev\\.mysql\\.com/doc/.*$"
],
"verbosity": "error",
"userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36"
}
6 changes: 3 additions & 3 deletions scripts/url-checker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,11 @@ echo "Pull request number: $PR_NUMBER"

# Read JSON file contents into a variable
JSON_CONTENT=$(cat link_report.json)
JSON_SEC_BUL_CONTENT=$(cat link_sec_bul_report.json)
JSON_RATE_LIMIT_CONTENT=$(cat link_rate_limit_report.json)


# Check if JSON file is empty
if [[ -z "$JSON_CONTENT" ]] && [[ -z "$JSON_SEC_BUL_CONTENT" ]]; then
if [[ -z "$JSON_CONTENT" ]] && [[ -z "$JSON_RATE_LIMIT_CONTENT" ]]; then
echo "No broken links found"
exit 0
fi
Expand All @@ -58,7 +58,7 @@ for link in $(echo "${JSON_CONTENT}" | jq -c '.[]'); do
COMMENT="${COMMENT}\n\n:link: Broken URL: ${url} \n:red_circle: State: ${state} \n:arrow_up: Parent Page: ${parent}\n\n"
done

for link in $(echo "${JSON_SEC_BUL_CONTENT}" | jq -c '.[]'); do
for link in $(echo "${JSON_RATE_LIMIT_CONTENT}" | jq -c '.[]'); do
url=$(echo "${link}" | jq -r '.url')
status=$(echo "${link}" | jq -r '.status')
state=$(echo "${link}" | jq -r '.state')
Expand Down

0 comments on commit 91bf6ed

Please sign in to comment.