# Workflow file captured from the GitHub Actions run page "Issue32792 #413".

name: Test Hyperlinks

# Three ways to start the scan:
#   * pull_request      - pull requests targeting the listed branches
#   * workflow_call     - reuse from another workflow
#   * workflow_dispatch - manual run from the Actions tab
on:
  pull_request:
    # Entries prefixed with "!" exclude the matching candidate branches.
    branches:
      - 'master'
      - 'candidate-*'
      - '!candidate-9.4.*'
      - '!candidate-9.2.*'
      - '!candidate-9.0.*'
      - '!candidate-8.*'
      - '!candidate-7.*'
      - '!candidate-6.*'

  workflow_call:
    inputs:
      # Marker input: its non-empty default is how the job recognises that it
      # was started through workflow_call (see the "List Documentation files" step).
      event-type:
        type: string
        required: false
        default: 'workflow_call'
      file-path:
        description: >-
          Specify the path for the directory or file.
          To specify multiple directories or files, separate them by Commas(,).
          Eg. docs/EN_US,docs/PT_BR
        type: string
        required: false
        default: 'docs/'
      file-type:
        description: >-
          Specify the files which need to be scanned (md/xml/rst).
          To specify multiple file types separate them by Commas(,).
          Eg. xml,md
        type: string
        required: false
        default: 'xml'
      debug-mode:
        description: 'Run in Debug mode to upload all created files'
        type: boolean
        required: false
        default: false

  workflow_dispatch:
    inputs:
      file-path:
        description: >-
          Specify the path for the directory or file.
          To specify multiple directories or files, separate them by Commas(,).
          Eg. docs/EN_US,devdoc/
        type: string
        required: false
        default: '/'
      file-type:
        description: >-
          Specify the files which need to be scanned (md/xml/rst).
          To specify multiple file types separate them by Commas(,).
          Eg. xml,md
        type: string
        required: false
        default: 'xml,md,rst'
      debug-mode:
        description: 'Run in Debug mode to upload all created files'
        type: boolean
        required: false
        default: false

# The workflow has a single job, `main`, which runs on an ubuntu-22.04 runner.
# Every `- name:` entry that follows is one step of this job.
# NOTE(review): nesting indentation appears to have been lost in this copy of
# the file - confirm the structure against the version stored in the repository.
jobs:
main:
runs-on: ubuntu-22.04
steps:
# Fetch the HPCC-Platform sources that the following steps scan.
- name: Checkout repository
  uses: actions/checkout@v4
  with:
    # Two commits are fetched so that HEAD^1 exists for the
    # `git diff --name-only HEAD^1 HEAD` call made on pull requests.
    fetch-depth: 2
    repository: hpcc-systems/HPCC-Platform
# Builds xmlFilesList.txt, mdFilesList.txt and rstFilesList.txt: one path per
# line for every documentation file that the following steps have to scan.
- name: List Documentation files
  env:
    # Workflow inputs are handed to the script as environment variables rather
    # than being expanded into the script text, so their content can never be
    # interpreted as shell code.
    EVENT_TYPE_INPUT: ${{ inputs.event-type }}
    FILE_PATH_INPUT: ${{ inputs.file-path }}
    FILE_TYPE_INPUT: ${{ inputs.file-type }}
  run: |
    # Determine the event type that triggered this workflow.
    # When a workflow is triggered by `workflow_call`, it doesn't explicitly provide
    # the event type of the call. Instead, it shares the event context of the calling workflow.
    # To identify that case we use the `event-type` input: it only has a value when the
    # workflow was started through `workflow_call`. Otherwise the event name is used.
    if [[ -n "${EVENT_TYPE_INPUT}" ]]; then
      EVENT_TYPE="${EVENT_TYPE_INPUT}"
    else
      EVENT_TYPE="${GITHUB_EVENT_NAME}"
    fi

    # Print $1 without leading/trailing whitespace, so "docs/EN_US, devdoc/"
    # is accepted the same way as "docs/EN_US,devdoc/".
    trim() {
      local VALUE="$1"
      VALUE="${VALUE#"${VALUE%%[![:space:]]*}"}"
      VALUE="${VALUE%"${VALUE##*[![:space:]]}"}"
      printf '%s' "${VALUE}"
    }

    touch xmlFilesList.txt mdFilesList.txt rstFilesList.txt
    if [[ "${EVENT_TYPE}" == "workflow_dispatch" || "${EVENT_TYPE}" == "workflow_call" ]]; then
      IFS=',' read -r -a DIR_LIST <<< "${FILE_PATH_INPUT}"
      IFS=',' read -r -a FILE_TYPE_LIST <<< "${FILE_TYPE_INPUT}"
      for DIR in "${DIR_LIST[@]}"
      do
        DIR=$( trim "${DIR}" )
        if [[ -z "${DIR}" ]]; then
          continue
        fi
        # complete, normalised path, ex: HPCC-Platform//docs --> HPCC-Platform/docs
        DIR=$( realpath "${PWD}/${DIR}" )
        if [[ -f "${DIR}" ]]; then
          # the path points to a single file: append it to the list of its type
          FILE_TYPE="${DIR##*.}"      # extract the file extension
          FILE_TYPE="${FILE_TYPE,,}"  # lowercase, same as for directory scans
          echo "${DIR}" | tee -a "${FILE_TYPE}FilesList.txt"
          continue
        fi
        for FILE_TYPE in "${FILE_TYPE_LIST[@]}"
        do
          FILE_TYPE=$( trim "${FILE_TYPE}" )
          FILE_TYPE="${FILE_TYPE#.}"   # remove leading dot(.) if present
          FILE_TYPE="${FILE_TYPE,,}"   # convert the FILE_TYPE to lowercase
          if [[ -z "${FILE_TYPE}" ]]; then
            continue
          fi
          find "${DIR}" -name "*.${FILE_TYPE}" -type f | tee -a "${FILE_TYPE}FilesList.txt"
        done
      done
      # Remove duplicates from every list (overlapping directories, or a file
      # that was named directly and also found by a directory scan).
      for LIST in *FilesList.txt
      do
        sort -u "${LIST}" -o "${LIST}"
      done
    elif [[ "${EVENT_TYPE}" == "pull_request" ]]; then
      git diff --name-only HEAD^1 HEAD > updatedFiles.txt
      # Anchor on the end of the name so that only real .xml/.md/.rst files are
      # selected (not e.g. "notes.md5" or "schema.xml.bak").
      grep -E '\.xml$' updatedFiles.txt | tee xmlFilesList.txt
      grep -E '\.md$' updatedFiles.txt | tee mdFilesList.txt
      grep -E '\.rst$' updatedFiles.txt | tee rstFilesList.txt
    fi
# Extracts every hyperlink from the listed files into linksList.txt as
# "<file>:<line>:<link>" and splits the result into externalLinks.txt
# (http/https) and internalLinks.txt (everything else).
- name: List links from Documentation files
  run: |
    touch missingFiles.txt

    # Returns success when "$1" is not a regular file; in that case the path is
    # reported on the console and recorded in missingFiles.txt.
    is_missing() {
      if [[ -f "$1" ]]; then
        return 1
      fi
      echo -e "$1 -\e[31m file missing\e[0m"
      echo "$1" >> missingFiles.txt
    }

    # Lists are read line by line and every expansion is quoted, so paths and
    # links are never word-split or glob-expanded by the shell.

    # DocBook XML: keep only the url="http..." attributes that sit between an
    # opening <ulink and its closing </ulink>.
    while IFS= read -r FILE
    do
      if [[ -z "${FILE}" ]] || is_missing "${FILE}"; then
        continue
      fi
      grep -onHE -e "<ulink" -e 'url="http[^\"\]+' -e "</ulink>" "${FILE}" | sed 's/url="//' > links.tmp
      FLAG=0
      while IFS= read -r LINE
      do
        LINK=$( cut -d ':' -f3- <<< "${LINE}" )
        if [[ "${LINK:0:6}" == '<ulink' ]]; then
          FLAG=1
        elif [[ "${LINK:0:8}" == '</ulink>' ]]; then
          FLAG=0
        elif [[ ${FLAG} -eq 1 ]]; then
          printf '%s\n' "${LINE}" >> linksList.txt
        fi
      done < links.tmp
    done < xmlFilesList.txt

    # Markdown: collect "](target" links and bare http(s) URLs, ignoring
    # everything between a pair of triple backticks.
    while IFS= read -r FILE
    do
      if [[ -z "${FILE}" ]] || is_missing "${FILE}"; then
        continue
      fi
      grep -onHE -e "\]\([^\)]+" -e "\`\`\`" -e "http://[^\ \;\"\'\<\>\,\`\)]+" -e "https://[^\ \;\"\'\<\>\,\`\)]+" "${FILE}" | sed 's/](//' > links.tmp
      FLAG=0
      while IFS= read -r LINE
      do
        LINK=$( cut -d ':' -f3- <<< "${LINE}" )
        if [[ "${LINK:0:3}" == "\`\`\`" ]]; then
          FLAG=$(( 1 - FLAG ))   # toggles on every triple backtick
        elif [[ ${FLAG} -eq 0 ]]; then
          printf '%s\n' "${LINE}" >> linksList.txt
        fi
      done < links.tmp
    done < mdFilesList.txt

    # reStructuredText: ".. _name: target" definitions and bare http(s) URLs.
    while IFS= read -r FILE
    do
      if [[ -z "${FILE}" ]] || is_missing "${FILE}"; then
        continue
      fi
      grep -onHE -e ".. _[^\]+" -e "http://[^\ \;\"\'\<\>\,\`\)]+" -e "https://[^\ \;\"\'\<\>\,\`\)]+" "${FILE}" | sed 's/.. _[^\:]*: //' >> linksList.txt
    done < rstFilesList.txt

    if [[ -f linksList.txt ]]; then
      # Entries mentioning the loopback address/localhost or containing "$" or
      # "[" are dropped; the dots of the address are escaped so that only the
      # literal "127.0.0.1" is filtered.
      echo "External links: "
      grep -vE '127\.0\.0\.1|localhost|\$|\[' linksList.txt | grep -E 'https://|http://' | tee externalLinks.txt
      echo -e "\nInternal links: "
      grep -vE '127\.0\.0\.1|localhost|\$|\[' linksList.txt | grep -vE 'https://|http://' | tee internalLinks.txt
    fi
# Requests every external link once and appends the links that answer 404, or
# cannot be reached at all, to error-report.log.
- name: Test External links
  run: |
    touch checkedLinksCache.txt
    # Status codes already obtained, keyed by the exact URL. An in-memory map is
    # used for lookups because searching checkedLinksCache.txt with grep treats
    # the URL as a regular expression and breaks for URLs containing "~".
    # The file is still written ("<url>~<code>") because it is uploaded in debug mode.
    declare -A RESPONSE_CODE_BY_LINK
    if [[ -f externalLinks.txt ]]; then
      while IFS= read -r LINE
      do
        LINK=$( cut -d ':' -f3- <<< "${LINE}" )
        LINK="${LINK%.}"   # remove trailing dot(.)
        LINK="${LINK% }"   # remove trailing space
        if [[ -z "${LINK}" ]]; then
          continue
        fi
        if [[ -n "${RESPONSE_CODE_BY_LINK[${LINK}]+cached}" ]]; then
          HTTP_RESPONSE_CODE="${RESPONSE_CODE_BY_LINK[${LINK}]}"
        else
          HTTP_RESPONSE_CODE=0
          TRY=3   # Max attempts to check status code of hyperlinks
          while [[ ${TRY} -ne 0 ]]
          do
            # curl prints 000 when no HTTP response was received
            HTTP_RESPONSE_CODE=$( curl -o /dev/null -m 60 -sL -w "%{response_code}" "${LINK}" ) || true
            if [[ ${HTTP_RESPONSE_CODE} -ne 0 ]]; then
              RESPONSE_CODE_BY_LINK["${LINK}"]="${HTTP_RESPONSE_CODE}"
              echo "${LINK}~${HTTP_RESPONSE_CODE}" >> checkedLinksCache.txt
              break
            fi
            echo "${LINE}"
            echo "retrying..."
            TRY=$(( TRY - 1 ))
          done
        fi
        if [[ ${HTTP_RESPONSE_CODE} -eq 404 ]]; then
          echo -e "${LINK} - \e[31m404 Error\e[0m"
          echo "${LINE}" >> error-report.log
        elif [[ ${HTTP_RESPONSE_CODE} -eq 0 ]]; then
          # No response at all: run curl once more, this time to capture its error text.
          HTTP_ERROR_MESSAGE=$( curl -o /dev/null -m 60 -sSL "${LINK}" 2>&1 ) || true
          echo -e "${LINK} - \e[31m${HTTP_ERROR_MESSAGE}\e[0m"
          # Keep the message on one line and free of spaces: later steps split
          # the report entries on whitespace.
          HTTP_ERROR_MESSAGE=$( tr '\n' ' ' <<< "${HTTP_ERROR_MESSAGE}" | sed 's/ *$//; s/ /-/g' )
          echo "${LINE}(${HTTP_ERROR_MESSAGE})" >> error-report.log
        else
          echo "${LINK} - ${HTTP_RESPONSE_CODE}"
        fi
      done < externalLinks.txt
    fi
# Validates links that do not point to http(s) URLs: "#anchor" references are
# looked up among the headings of the same file, everything else must resolve
# to an existing file or directory. Failures are appended to error-report.log.
- name: Test Internal Links
  run: |
    # Print the anchor each Markdown heading ("# Title") of file $1 would get:
    # lower-cased, punctuation removed, spaces replaced by hyphens.
    # NOTE(review): approximates GitHub's section-link rules (ASCII only, no
    # "-1" suffixes for duplicate headings) - it is only used to accept
    # additional anchors, never to reject one.
    heading_anchors() {
      grep -E '^#{1,6}[[:space:]]' "$1" \
        | sed -E 's/^#+[[:space:]]+//; s/[[:space:]]+#*[[:space:]]*$//' \
        | tr '[:upper:]' '[:lower:]' \
        | sed -E 's/[^a-z0-9 _-]//g; s/ /-/g'
    }

    if [[ -f internalLinks.txt ]]; then
      # Each entry is "<file>:<line>:<reference>". Entries are read line by line;
      # splitting them on spaces would turn a link title into a bogus entry.
      while IFS= read -r LINE
      do
        FILE=$( cut -d ':' -f1 <<< "${LINE}" )
        LINE_NUMBER=$( cut -d ':' -f2 <<< "${LINE}" )
        REFERENCE=$( cut -d ':' -f3- <<< "${LINE}" )
        # keep the link target only: trim leading blanks, drop an optional ' "title"'
        REFERENCE="${REFERENCE#"${REFERENCE%%[![:space:]]*}"}"
        REFERENCE="${REFERENCE%%[[:space:]]*}"
        if [[ "${REFERENCE:0:1}" == '#' ]]; then
          # Accepted when a heading equals the link text, or when the anchor
          # equals the anchor generated from one of the headings.
          LINK_TEXT=$( grep -oE "\[.*\]\(${REFERENCE}\)" "${FILE}" | sed 's/\[//' | cut -d ']' -f1 )
          IS_PRESENT=$( grep -oF -- "# ${LINK_TEXT}" "${FILE}" | wc -w )
          ANCHOR="${REFERENCE#\#}"
          if [[ ${IS_PRESENT} -eq 0 ]] && heading_anchors "${FILE}" | grep -qxF -- "${ANCHOR,,}"; then
            IS_PRESENT=1
          fi
          if [[ ${IS_PRESENT} -eq 0 ]]; then
            echo -e "${LINE} -\e[31m invalid reference\e[0m"
            echo "${LINE}" >> error-report.log
          else
            echo -e "${LINE} -\e[32m valid reference\e[0m"
          fi
        else
          # "path#section" names the file "path"; the fragment is not part of the path
          TARGET="${REFERENCE%%#*}"
          if [[ "${REFERENCE:0:1}" == '/' ]]; then
            BASE_DIR="${PWD}"
          else
            # dirname yields "." for files at the repository root, so the
            # reference is never resolved against the filesystem root
            BASE_DIR=$( dirname "${FILE}" )
          fi
          # -m: resolve even when intermediate directories do not exist, so a
          # broken link is reported below instead of aborting the whole step
          SEARCH_PATH=$( realpath -m "${BASE_DIR}/${TARGET}" )
          # if it is neither a valid file nor a valid directory, then it is an invalid reference
          if [[ ! -f "${SEARCH_PATH}" && ! -d "${SEARCH_PATH}" ]]; then
            echo -e "${LINE} -\e[31m invalid reference\e[0m"
            echo "${FILE}:${LINE_NUMBER}:${SEARCH_PATH}" >> error-report.log
          else
            echo -e "${LINE} -\e[32m valid reference\e[0m"
          fi
        fi
      done < internalLinks.txt
    fi
# Prints the scan summary and fails the job when error-report.log contains at
# least one broken link.
- name: Report Error links
  run: |
    # Number of entries in the report; stays 0 when no report was written.
    BROKEN_REFERENCE_COUNT=0
    if [[ -f error-report.log ]]; then
      BROKEN_REFERENCE_COUNT=$( wc -l < error-report.log )
    fi
    echo -e "\e[32mNo. of files scanned : $( cat *FilesList.txt | wc -l )\e[0m"
    if [[ ${BROKEN_REFERENCE_COUNT} -ne 0 ]]; then
      # field 3 onwards of "<file>:<line>:<link>" is the link itself
      echo -e "\e[31mNo. of unique broken links : $( cut -d: -f3- error-report.log | sort -u | wc -l )\e[0m"
      echo -e "\e[31mTotal No. of reference to broken links : ${BROKEN_REFERENCE_COUNT}\e[0m"
      echo "Checkout the log artifact in the summary page for more details about the broken links."
      echo "Note: If any of the reported broken links are just example links or placeholders and are not valid links, please enclose them in triple backticks to ignore them."
      echo "For example: \`\`\`https://This/is/not/a/valid/link.com\`\`\`"
      # exit statuses are 0-255; use a plain non-zero status to fail the step
      exit 1
    else
      echo -e "\e[32mNo Broken-links found\e[0m"
    fi
# Rewrites error-report.log into a readable report (broken links grouped per
# file, preceded by the list of missing files) and removes the intermediate
# files unless debug mode asks to keep them.
- name: Modify log file
  if: ${{ failure() || cancelled() }}
  env:
    # passed through the environment instead of being expanded into the script
    DEBUG_MODE_INPUT: ${{ inputs.debug-mode }}
  run: |
    # This step also runs when an earlier step failed or was cancelled before
    # any report was written, so the report is only formatted when it exists.
    if [[ -f error-report.log ]]; then
      # show paths relative to the repository instead of the runner workspace
      BASE_DIR=${PWD}
      BASE_DIR=$( echo "${BASE_DIR}" | sed 's/\//\\\//g' )
      sed -i "s/${BASE_DIR}/HPCC-Platform/g" error-report.log

      : > error-reportTmp.log
      FILE_COUNT=1
      while IFS= read -r FILE_NAME
      do
        echo "${FILE_COUNT}. ${FILE_NAME}" >> error-reportTmp.log
        FILE_COUNT=$(( FILE_COUNT + 1 ))
        # Select the entries whose file field is exactly FILE_NAME (literal
        # prefix comparison, not a regular-expression search) and print the
        # remaining "<line>:<link>" part.
        while IFS= read -r LINK
        do
          printf '\t Line %s\n' "${LINK}" | sed 's/:/ : /' >> error-reportTmp.log
        done < <( PREFIX="${FILE_NAME}:" awk 'index($0, ENVIRON["PREFIX"]) == 1 { print substr($0, length(ENVIRON["PREFIX"]) + 1) }' error-report.log )
      done < <( cut -d ':' -f1 error-report.log | sort -u )

      {
        if [[ -s missingFiles.txt ]]; then
          echo -e "Missing Files:"
          FILE_COUNT=1
          while IFS= read -r FILE
          do
            echo -e "${FILE_COUNT}. ${FILE}"
            FILE_COUNT=$(( FILE_COUNT + 1 ))
          done < missingFiles.txt
          echo -e "\nBroken links: \n"
        else
          echo -e "Broken links: \n"
        fi
        cat error-reportTmp.log
      } > error-report.log
    fi

    # debug-mode is empty for pull requests; treat that as "false"
    DEBUG_MODE="${DEBUG_MODE_INPUT:-false}"
    if [[ "${GITHUB_EVENT_NAME}" == "pull_request" || "${DEBUG_MODE}" == false ]]; then
      rm -rf *FilesList.txt \
        checkedLinksCache.txt \
        *Links.txt \
        linksList.txt
    fi
# Publishes the report (and, in debug mode, every intermediate file that is
# still present) as the "Hyperlinks-testing-log" artifact.
- name: Upload logs
  uses: actions/upload-artifact@v4
  # `debug-mode` is declared with `type: boolean`, so the `inputs` context holds
  # a boolean. Comparing it with the string 'true' is never true (mismatching
  # types are coerced to numbers: true -> 1, 'true' -> NaN), hence the
  # comparison with the boolean literal.
  if: ${{ failure() || cancelled() || inputs.debug-mode == true }}
  with:
    name: Hyperlinks-testing-log
    path: |
      ./error-report.log
      ./*FilesList.txt
      ./checkedLinksCache.txt
      ./*Links.txt
      ./linksList.txt
    if-no-files-found: ignore