HPCC-32795 Additional optimizations to replaceString #461

Summary
Jobs
- main
Run details
- Usage
- Workflow file

Workflow file for this run

.github/workflows/test-hyperlinks.yml at b6dc378

	# Finds hyperlinks in documentation files and reports the ones that are broken.
	name: Test Hyperlinks

	on:
	  # Pull requests targeting master or a candidate branch, except the older
	  # release lines excluded by the "!" patterns below.
	  pull_request:
	    branches:
	      - "master"
	      - "candidate-*"
	      - "!candidate-9.4.*"
	      - "!candidate-9.2.*"
	      - "!candidate-9.0.*"
	      - "!candidate-8.*"
	      - "!candidate-7.*"
	      - "!candidate-6.*"

	  # Entry point for other workflows that reuse this one.
	  workflow_call:
	    inputs:
	      # The job script reads this to recognise a workflow_call run, because
	      # github.event_name only reports the calling workflow's event.
	      event-type:
	        type: string
	        default: "workflow_call"
	        required: false
	      file-path:
	        type: string
	        description: Specify the path for the directory or file. To specify multiple directories or files, separate them by Commas(,). Eg. docs/EN_US,docs/PT_BR
	        default: "docs/"
	        required: false
	      file-type:
	        type: string
	        description: Specify the files which need to be scanned (md/xml/rst). To specify multiple file types separate them by Commas(,). Eg. xml,md
	        default: 'xml'
	        required: false
	      debug-mode:
	        type: boolean
	        description: Run in Debug mode to upload all created files
	        default: false
	        required: false

	  # Manual runs; the defaults scan the whole checkout for every supported file type.
	  workflow_dispatch:
	    inputs:
	      file-path:
	        type: string
	        description: Specify the path for the directory or file. To specify multiple directories or files, separate them by Commas(,). Eg. docs/EN_US,devdoc/
	        default: "/"
	        required: false
	      file-type:
	        type: string
	        description: Specify the files which need to be scanned (md/xml/rst). To specify multiple file types separate them by Commas(,). Eg. xml,md
	        default: 'xml,md,rst'
	        required: false
	      debug-mode:
	        type: boolean
	        description: Run in Debug mode to upload all created files
	        default: false
	        required: false

	jobs:
	  main:
	    runs-on: ubuntu-22.04
	    steps:
	      - name: Checkout repository
	        uses: actions/checkout@v4
	        with:
	          repository: hpcc-systems/HPCC-Platform
	          # Two commits are fetched so the pull_request path of the next
	          # step can run `git diff HEAD^1 HEAD`.
	          fetch-depth: 2

	- name: List Documentation files
	run: \|
	# Determine the event type that triggered this workflow
	# When a workflow is triggered by `workflow_call`, it doesn't explicitly provide
	# the event type of the call. Instead, it shares the event context of the calling workflow.
	# To identify if the workflow was triggered by `workflow_call`, we use an input parameter
	# called `event-type`. If this input is provided, it helps us identify that the workflow was
	# triggered by `workflow_call`. If the input is not present, we use the github.event_name to determine the event.
	if [ -n "${{ inputs.event-type }}" ]; then
	EVENT_TYPE="${{ inputs.event-type }}"
	else
	EVENT_TYPE="${{ github.event_name }}"
	fi
	touch xmlFilesList.txt mdFilesList.txt rstFilesList.txt
	if [[ "${EVENT_TYPE}" == "workflow_dispatch" \|\| "${EVENT_TYPE}" == "workflow_call" ]]; then
	IFS=',' read -a DIR_LIST <<< "${{ inputs.file-path }}"
	IFS=',' read -a FILE_TYPE_LIST <<< "${{ inputs.file-type }}"
	for DIR in ${DIR_LIST[@]}
	do
	DIR=${PWD}/${DIR} #gets the complete path
	DIR=$( realpath ${DIR} ) #gets the actual path ex: HPCC-Platform//docs --> HPCC-Platform/docs
	if [[ -f ${DIR} ]]; then #if the specified path points to a file append it to respective list
	FILE_TYPE=${DIR##*.} #extract the file extension
	echo ${DIR} \| tee -a ${FILE_TYPE}FilesList.txt
	continue
	fi
	for FILE_TYPE in ${FILE_TYPE_LIST[@]}
	do
	FILE_TYPE=${FILE_TYPE#.} #remove leading dot(.) if present
	FILE_TYPE=${FILE_TYPE,,} #convert the FILE_TYPE to lowercase
	find ${DIR} -name "*.${FILE_TYPE}" -type f \| tee -a ${FILE_TYPE}FilesList.txt
	# remove if any duplicate files are present
	sort -u ${FILE_TYPE}FilesList.txt -o ${FILE_TYPE}FilesList.txt
	done
	done
	elif [[ "${EVENT_TYPE}" == "pull_request" ]]; then
	git diff --name-only HEAD^1 HEAD > updatedFiles.txt
	cat updatedFiles.txt \| grep -E "*\.xml" \| tee xmlFilesList.txt
	cat updatedFiles.txt \| grep -E "*\.md" \| tee mdFilesList.txt
	cat updatedFiles.txt \| grep -E "*\.rst" \| tee rstFilesList.txt
	fi

	- name: List links from Documentation files
	run: \|
	IFS=$'\n'
	touch missingFiles.txt
	for FILE in $( cat xmlFilesList.txt )
	do
	#check if the file is missing
	if [[ ! -f $FILE ]]; then
	echo -e "$FILE -\e[31m file missing\e[0m"
	echo $FILE >> missingFiles.txt
	continue
	fi
	grep -onHE -e "<ulink" -e 'url="http[^\"\]+' -e "</ulink>" ${FILE} \| sed 's/url="//' > links.tmp
	FLAG=0
	for LINE in $( cat links.tmp )
	do
	LINK=$( echo $LINE \| cut -d ':' -f3- )
	if [[ ${LINK:0:6} == '<ulink' ]]; then
	FLAG=1
	continue
	elif [[ ${LINK:0:8} == '</ulink>' ]]; then
	FLAG=0
	continue
	fi
	if [[ $FLAG -eq 1 ]]; then
	echo $LINE >> linksList.txt
	fi
	done
	done
	for FILE in $( cat mdFilesList.txt )
	do
	#check if the file is missing
	if [[ ! -f $FILE ]]; then
	echo -e "$FILE -\e[31m file missing\e[0m"
	echo $FILE >> missingFiles.txt
	continue
	fi
	grep -onHE -e "\]$[^$]+" -e "\`\`\`" -e "http://[^\ \;\"\'\<\>\,\`\)]+" -e "https://[^\ \;\"\'\<\>\,\`\)]+" ${FILE} \| sed 's/](//' > links.tmp
	FLAG=0
	for LINE in $( cat links.tmp )
	do
	LINK=$( echo $LINE \| cut -d ':' -f3- )
	if [[ ${LINK:0:3} == "\`\`\`" ]]; then
	FLAG=$(( 1 - FLAG ))
	continue
	fi
	if [[ $FLAG -eq 0 ]]; then
	echo $LINE >> linksList.txt
	fi
	done
	done

	for FILE in $( cat rstFilesList.txt )
	do
	#check if the file is missing
	if [[ ! -f $FILE ]]; then
	echo -e "$FILE -\e[31m file missing\e[0m"
	echo $FILE >> missingFiles.txt
	continue
	fi
	grep -onHE -e ".. _[^\]+" -e "http://[^\ \;\"\'\<\>\,\`\)]+" -e "https://[^\ \;\"\'\<\>\,\`\)]+" ${FILE} \| sed 's/.. _[^\:]*: //' >> linksList.txt
	done

	if [[ -f linksList.txt ]]; then
	echo "External links: "
	cat linksList.txt \| grep -vE '127.0.0.1\|localhost\|\$\|\[' \| grep -E 'https://\|http://' \| tee externalLinks.txt
	echo -e "\nInternal links: "
	cat linksList.txt \| grep -vE '127.0.0.1\|localhost\|\$\|\[' \| grep -vE 'https://\|http://' \| tee internalLinks.txt
	fi

	- name: Test External links
	run: \|
	touch checkedLinksCache.txt
	IFS=$'\n'
	if [[ -f externalLinks.txt ]]; then
	for LINE in $(cat externalLinks.txt )
	do
	LINK=$( echo $LINE \| cut -d ':' -f3- )
	LINK=${LINK%.} #remove trailing dot(.)
	LINK=${LINK% } #remove trailing space
	CHECK_CACHE=$( cat checkedLinksCache.txt \| grep "$LINK~" \| wc -w )
	TRY=3 #Max attempts to check status code of hyperlinks
	if [[ $CHECK_CACHE -eq 0 ]]; then
	while [[ $TRY -ne 0 ]]
	do
	HTTP_RESPONSE_CODE=$( curl -o /dev/null -m 60 -sL -w "%{response_code}" $LINK ) \|\| true
	if [[ $HTTP_RESPONSE_CODE -ne 0 ]]; then
	echo "$LINK~$HTTP_RESPONSE_CODE" >> checkedLinksCache.txt
	break
	else
	echo $LINE
	echo "retrying..."
	TRY=$(( TRY - 1))
	fi
	done
	else
	HTTP_RESPONSE_CODE=$( cat checkedLinksCache.txt \| grep "$LINK~" \| cut -d '~' -f2 )
	fi
	if [[ $HTTP_RESPONSE_CODE -eq 404 ]]; then
	echo -e "${LINK} - \e[31m404 Error\e[0m"
	echo "${LINE}" >> error-report.log
	elif [[ $HTTP_RESPONSE_CODE -eq 0 ]]; then
	HTTP_ERROR_MESSAGE=$( curl -o /dev/null -m 60 -sSL $LINK 2>&1) \|\| true
	echo -e "${LINK} - \e[31m${HTTP_ERROR_MESSAGE}\e[0m"
	HTTP_ERROR_MESSAGE=$( echo $HTTP_ERROR_MESSAGE \| sed 's/ /-/g' )
	echo "${LINE}(${HTTP_ERROR_MESSAGE})" >> error-report.log
	else
	echo "${LINK} - ${HTTP_RESPONSE_CODE}"
	fi
	done
	fi

	- name: Test Internal Links
	run: \|
	if [[ -f internalLinks.txt ]]; then
	for LINE in $( cat internalLinks.txt )
	do
	REFERENCE=$( echo $LINE \| cut -d ':' -f3- )
	FILE=$( echo $LINE \| cut -d ':' -f1 )
	if [[ ${REFERENCE:0:1} == '#' ]]; then
	LINK_TEXT=$( cat $FILE \| grep -oE "\[.*\]$${REFERENCE}$" \| sed 's/\[//' \| cut -d ']' -f1 )
	IS_PRESENT=$(cat $FILE \| grep -oE "# ${LINK_TEXT}" \| wc -w)
	if [[ $IS_PRESENT -eq 0 ]]; then
	echo -e "${LINE} -\e[31m invalid reference\e[0m"
	echo "${LINE}" >> error-report.log
	else
	echo -e "${LINE} -\e[32m valid reference\e[0m"
	fi
	else
	if [[ ${REFERENCE:0:1} == '/' ]]; then
	BASE_DIR=$PWD
	else
	BASE_DIR=${FILE/$( basename $FILE )}
	fi
	SEARCH_PATH="$BASE_DIR/${REFERENCE}"
	SEARCH_PATH=$( realpath $SEARCH_PATH )
	# if it is neither a valid file nor valid a directory, then it is an invalid reference
	if [[ ! -f $SEARCH_PATH && ! -d $SEARCH_PATH ]]; then
	echo -e "${LINE} -\e[31m invalid reference\e[0m"
	echo ${LINE/$REFERENCE/$SEARCH_PATH} >> error-report.log
	else
	echo -e "${LINE} -\e[32m valid reference\e[0m"
	fi
	fi
	done
	fi

	- name: Report Error links
	run: \|
	if [[ -f error-report.log ]]; then
	NUMBER_OF_404_LINKS=$( cat error-report.log \| wc -l )
	fi
	echo -e "\e[32mNo. of files scanned : $( cat *FilesList.txt \| wc -l )\e[0m"
	if [[ $NUMBER_OF_404_LINKS -ne 0 ]]; then
	echo -e "\e[31mNo. of unique broken links : $( cat error-report.log \| cut -d: -f3- \| sort \| uniq \| wc -l )\e[0m"
	echo -e "\e[31mTotal No. of reference to broken links : $( cat error-report.log \| cut -d: -f3- \| sort \| wc -l )\e[0m"
	echo "Checkout the log artifact in the summary page for more details about the broken links."
	echo "Note: If any of the reported broken links are just example links or placeholders and are not valid links, please enclose them in triple backticks to ignore them."
	echo "For example: \`\`\`https://This/is/not/a/valid/link.com\`\`\`"
	exit -1
	else
	echo -e "\e[32mNo Broken-links found\e[0m"
	fi

	- name: Modify log file
	if: ${{ failure() \|\| cancelled() }}
	run: \|
	BASE_DIR=${PWD}
	BASE_DIR=$(echo $BASE_DIR \| sed 's/\//\\\//g')
	sed -i "s/${BASE_DIR}/HPCC-Platform/g" error-report.log
	FILE_NAMES_LIST=$(cat error-report.log \| cut -d ':' -f1 \| sort \| uniq )
	FILE_COUNT=1
	for LINE in $FILE_NAMES_LIST
	do
	LINKS_LIST=$( cat error-report.log \| grep $LINE \| cut -d ':' -f2- )
	echo "$FILE_COUNT. $LINE" >> error-reportTmp.log
	FILE_COUNT=$(( FILE_COUNT + 1))
	for LINK in $LINKS_LIST
	do
	echo -e "\t Line $LINK" \| sed 's/:/ : /' >> error-reportTmp.log
	done
	done
	if [[ $(cat missingFiles.txt \| wc -w ) -eq 0 ]]; then
	echo -e "Broken links: \n" > error-report.log
	cat error-reportTmp.log >> error-report.log
	else
	echo -e "Missing Files:" > error-report.log
	FILE_COUNT=1
	for FILE in $( cat missingFiles.txt )
	do
	echo -e "${FILE_COUNT}. ${FILE}" >> error-report.log
	FILE_COUNT=$(( FILE_COUNT + 1 ))
	done
	echo -e "\nBroken links: \n" >> error-report.log
	cat error-reportTmp.log >> error-report.log
	fi
	if [ -z ${{ inputs.debug-mode }} ]; then
	DEBUG_MODE=false
	else
	DEBUG_MODE=${{ inputs.debug-mode }}
	fi
	if [[ ${{ github.event_name }} == "pull_request" \|\| $DEBUG_MODE == false ]]; then
	rm -rf *FilesList.txt \
	checkedLinksCache.txt \
	*Links.txt \
	linksList.txt
	fi

	- name: Upload logs
	uses: actions/upload-artifact@v4
	if: ${{ failure() \|\| cancelled() \|\| inputs.debug-mode == 'true'}}
	with:
	name: Hyperlinks-testing-log
	path: \|
	./error-report.log
	./*FilesList.txt
	./checkedLinksCache.txt
	./*Links.txt
	./linksList.txt
	if-no-files-found: ignore

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

HPCC-32795 Additional optimizations to replaceString #461

Workflow file

HPCC-32795 Additional optimizations to replaceString #461

Jobs

Run details

Workflow file for this run