-
Notifications
You must be signed in to change notification settings - Fork 304
326 lines (313 loc) · 13.5 KB
/
test-hyperlinks.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
# Scans documentation files for hyperlinks and reports the broken ones.
name: Test Hyperlinks

on:
  # Pull requests against master and the candidate branches; the older
  # candidate lines prefixed with "!" are excluded.
  pull_request:
    branches:
      - 'master'
      - 'candidate-*'
      - '!candidate-9.4.*'
      - '!candidate-9.2.*'
      - '!candidate-9.0.*'
      - '!candidate-8.*'
      - '!candidate-7.*'
      - '!candidate-6.*'

  # Entry point for other workflows that reuse this one.
  workflow_call:
    inputs:
      # A called workflow shares the event context of its caller, so this
      # input is what lets the job recognise that it was started through
      # workflow_call.
      event-type:
        type: string
        required: false
        default: 'workflow_call'
      file-path:
        description: 'Specify the path for the directory or file. To specify multiple directories or files, separate them by Commas(,). Eg. docs/EN_US,docs/PT_BR'
        type: string
        required: false
        default: 'docs/'
      file-type:
        description: 'Specify the files which need to be scanned (md/xml/rst). To specify multiple file types separate them by Commas(,). Eg. xml,md'
        type: string
        required: false
        default: 'xml'
      debug-mode:
        description: 'Run in Debug mode to upload all created files'
        type: boolean
        required: false
        default: false

  # Manual runs; by default the whole repository is scanned for all three
  # supported file types.
  workflow_dispatch:
    inputs:
      file-path:
        description: 'Specify the path for the directory or file. To specify multiple directories or files, separate them by Commas(,). Eg. docs/EN_US,devdoc/'
        type: string
        required: false
        default: '/'
      file-type:
        description: 'Specify the files which need to be scanned (md/xml/rst). To specify multiple file types separate them by Commas(,). Eg. xml,md'
        type: string
        required: false
        default: 'xml,md,rst'
      debug-mode:
        description: 'Run in Debug mode to upload all created files'
        type: boolean
        required: false
        default: false
jobs:
  # Single job: every step below runs in the same workspace and hands its
  # results to the next step through files in the working directory.
  main:
    runs-on: ubuntu-22.04
    steps:
      # Two commits are fetched so that HEAD^1 is available to the
      # `git diff HEAD^1 HEAD` used for pull requests in the next step.
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 2
          repository: hpcc-systems/HPCC-Platform
- name: List Documentation files
  # Workflow inputs reach the script as environment variables instead of being
  # expanded into the script text, so an input value can never be executed as
  # shell code.
  env:
    CALLER_EVENT_TYPE: ${{ inputs.event-type }}
    SCAN_PATHS: ${{ inputs.file-path }}
    SCAN_TYPES: ${{ inputs.file-type }}
  run: |
    # Produces xmlFilesList.txt, mdFilesList.txt and rstFilesList.txt, each
    # holding one documentation file per line for the following steps.
    #
    # Determine the event type that triggered this workflow.
    # When a workflow is triggered by `workflow_call`, it doesn't explicitly provide
    # the event type of the call. Instead, it shares the event context of the calling workflow.
    # To identify if the workflow was triggered by `workflow_call`, we use an input parameter
    # called `event-type`. If this input is provided, the workflow was triggered by
    # `workflow_call`. If it is not present, GITHUB_EVENT_NAME determines the event.
    EVENT_TYPE="${CALLER_EVENT_TYPE:-${GITHUB_EVENT_NAME}}"

    touch xmlFilesList.txt mdFilesList.txt rstFilesList.txt

    if [[ "${EVENT_TYPE}" == "workflow_dispatch" || "${EVENT_TYPE}" == "workflow_call" ]]; then
      IFS=',' read -r -a DIR_LIST <<< "${SCAN_PATHS}"
      IFS=',' read -r -a FILE_TYPE_LIST <<< "${SCAN_TYPES}"
      for DIR in "${DIR_LIST[@]}"
      do
        # Trim surrounding whitespace so that "docs/EN_US, docs/PT_BR" still works
        # now that the entries are no longer word-split.
        DIR="${DIR#"${DIR%%[![:space:]]*}"}"
        DIR="${DIR%"${DIR##*[![:space:]]}"}"
        [[ -n "${DIR}" ]] || continue

        # Resolve against the workspace; realpath also normalises the path,
        # e.g. HPCC-Platform//docs --> HPCC-Platform/docs
        DIR=$( realpath "${PWD}/${DIR}" )

        # A path that points at a single file goes straight into the list for
        # its extension. Only the three lists read by the later steps are
        # accepted, so other files no longer create a stray *FilesList.txt.
        if [[ -f "${DIR}" ]]; then
          FILE_TYPE=${DIR##*.}
          case "${FILE_TYPE}" in
            xml|md|rst)
              echo "${DIR}" | tee -a "${FILE_TYPE}FilesList.txt"
              sort -u "${FILE_TYPE}FilesList.txt" -o "${FILE_TYPE}FilesList.txt"
              ;;
            *)
              echo "Skipping ${DIR}: only xml, md and rst files are scanned"
              ;;
          esac
          continue
        fi

        for FILE_TYPE in "${FILE_TYPE_LIST[@]}"
        do
          FILE_TYPE=${FILE_TYPE//[[:space:]]/} # drop stray whitespace around the entry
          FILE_TYPE=${FILE_TYPE#.}             # remove leading dot(.) if present
          FILE_TYPE=${FILE_TYPE,,}             # convert the FILE_TYPE to lowercase
          [[ -n "${FILE_TYPE}" ]] || continue
          find "${DIR}" -name "*.${FILE_TYPE}" -type f | tee -a "${FILE_TYPE}FilesList.txt"
          # remove if any duplicate files are present
          sort -u "${FILE_TYPE}FilesList.txt" -o "${FILE_TYPE}FilesList.txt"
        done
      done
    elif [[ "${EVENT_TYPE}" == "pull_request" ]]; then
      # Only the files touched by the pull request are scanned. The patterns
      # are anchored to the end of the name so that they select the same
      # files as the `find -name "*.<type>"` branch above.
      git diff --name-only HEAD^1 HEAD > updatedFiles.txt
      { grep -E '\.xml$' updatedFiles.txt || true; } | tee xmlFilesList.txt
      { grep -E '\.md$' updatedFiles.txt || true; } | tee mdFilesList.txt
      { grep -E '\.rst$' updatedFiles.txt || true; } | tee rstFilesList.txt
    fi
- name: List links from Documentation files
  run: |
    # Extracts `file:line:link` records from every listed documentation file
    # into linksList.txt, then splits them into externalLinks.txt and
    # internalLinks.txt. Listed files that do not exist are recorded in
    # missingFiles.txt instead.
    touch missingFiles.txt

    # Succeeds when the given file exists; otherwise reports it on the
    # console, records it in missingFiles.txt and fails so that the caller
    # can skip it.
    file_is_present() {
      if [[ -f "$1" ]]; then
        return 0
      fi
      echo -e "$1 -\e[31m file missing\e[0m"
      echo "$1" >> missingFiles.txt
      return 1
    }

    # XML (DocBook): only url="http..." attributes found between <ulink and
    # </ulink> are collected.
    while IFS= read -r FILE
    do
      [[ -n "${FILE}" ]] || continue
      file_is_present "${FILE}" || continue
      grep -onHE -e "<ulink" -e 'url="http[^\"\]+' -e "</ulink>" "${FILE}" | sed 's/url="//' > links.tmp
      FLAG=0 # 1 while inside a <ulink> ... </ulink> element
      while IFS= read -r LINE
      do
        # grep -onH prints file:line:match; keep what follows the second colon
        LINK=${LINE#*:}
        LINK=${LINK#*:}
        if [[ ${LINK:0:6} == '<ulink' ]]; then
          FLAG=1
        elif [[ ${LINK:0:8} == '</ulink>' ]]; then
          FLAG=0
        elif [[ $FLAG -eq 1 ]]; then
          printf '%s\n' "${LINE}" >> linksList.txt
        fi
      done < links.tmp
    done < xmlFilesList.txt

    # Markdown: inline link targets and bare URLs are collected, except those
    # enclosed in triple backticks.
    while IFS= read -r FILE
    do
      [[ -n "${FILE}" ]] || continue
      file_is_present "${FILE}" || continue
      grep -onHE -e "\]\([^\)]+" -e "\`\`\`" -e "http://[^\ \;\"\'\<\>\,\`\)]+" -e "https://[^\ \;\"\'\<\>\,\`\)]+" "${FILE}" | sed 's/](//' > links.tmp
      FLAG=0 # 1 while inside a triple-backtick section
      while IFS= read -r LINE
      do
        LINK=${LINE#*:}
        LINK=${LINK#*:}
        if [[ ${LINK:0:3} == '```' ]]; then
          FLAG=$(( 1 - FLAG ))
        elif [[ $FLAG -eq 0 ]]; then
          printf '%s\n' "${LINE}" >> linksList.txt
        fi
      done < links.tmp
    done < mdFilesList.txt

    # reStructuredText: hyperlink targets (".. _name: target") and bare URLs.
    # The dots are escaped so that only a literal ".. _" starts a target;
    # unescaped, any two characters followed by " _" were picked up as one.
    while IFS= read -r FILE
    do
      [[ -n "${FILE}" ]] || continue
      file_is_present "${FILE}" || continue
      grep -onHE -e '\.\. _[^\]+' -e "http://[^\ \;\"\'\<\>\,\`\)]+" -e "https://[^\ \;\"\'\<\>\,\`\)]+" "${FILE}" | sed 's/\.\. _[^\:]*: //' >> linksList.txt
    done < rstFilesList.txt

    # Records mentioning localhost/127.0.0.1 or containing "$" or "[" are
    # dropped; the rest is split by whether it contains an http(s) URL.
    if [[ -f linksList.txt ]]; then
      echo "External links: "
      grep -vE '127.0.0.1|localhost|\$|\[' linksList.txt | grep -E 'https://|http://' | tee externalLinks.txt
      echo -e "\nInternal links: "
      grep -vE '127.0.0.1|localhost|\$|\[' linksList.txt | grep -vE 'https://|http://' | tee internalLinks.txt
    fi
- name: Test External links
  run: |
    # Requests every external link once and appends the `file:line:link`
    # records of links that answer 404, or cannot be reached at all, to
    # error-report.log.
    touch checkedLinksCache.txt
    IFS=$'\n'

    # Response codes of the links already requested in this run, keyed by the
    # exact URL. Looking the URL up with grep treated it as a regular
    # expression and as a substring, and splitting the cached line on "~"
    # returned the wrong field for URLs that contain "~" themselves.
    # checkedLinksCache.txt is still written so that it can be uploaded in
    # debug mode.
    declare -A RESPONSE_CACHE

    if [[ -f externalLinks.txt ]]; then
      # The list is read on descriptor 3 so that nothing run inside the loop
      # can consume it through stdin.
      while IFS= read -r LINE <&3
      do
        [[ -n "${LINE}" ]] || continue
        # records look like file:line:link; keep what follows the second colon
        LINK=${LINE#*:}
        LINK=${LINK#*:}
        LINK=${LINK%.} #remove trailing dot(.)
        LINK=${LINK% } #remove trailing space
        [[ -n "${LINK}" ]] || continue

        CACHED_CODE=${RESPONSE_CACHE["$LINK"]:-}
        if [[ -n "${CACHED_CODE}" ]]; then
          HTTP_RESPONSE_CODE=${CACHED_CODE}
        else
          TRY=3 #Max attempts to check status code of hyperlinks
          while [[ $TRY -ne 0 ]]
          do
            # curl reports 000 when no HTTP response was received
            HTTP_RESPONSE_CODE=$( curl -o /dev/null -m 60 -sL -w "%{response_code}" "$LINK" ) || true
            if [[ $HTTP_RESPONSE_CODE -ne 0 ]]; then
              RESPONSE_CACHE["$LINK"]=${HTTP_RESPONSE_CODE}
              echo "$LINK~$HTTP_RESPONSE_CODE" >> checkedLinksCache.txt
              break
            fi
            echo "$LINE"
            echo "retrying..."
            TRY=$(( TRY - 1 ))
          done
        fi

        if [[ $HTTP_RESPONSE_CODE -eq 404 ]]; then
          echo -e "${LINK} - \e[31m404 Error\e[0m"
          echo "${LINE}" >> error-report.log
        elif [[ $HTTP_RESPONSE_CODE -eq 0 ]]; then
          # No response at all: run curl once more only to capture its error text.
          HTTP_ERROR_MESSAGE=$( curl -o /dev/null -m 60 -sSL "$LINK" 2>&1 ) || true
          echo -e "${LINK} - \e[31m${HTTP_ERROR_MESSAGE}\e[0m"
          # Deliberately unquoted: with IFS set to newline this joins a
          # multi-line message into one line before the spaces become dashes.
          HTTP_ERROR_MESSAGE=$( echo $HTTP_ERROR_MESSAGE | sed 's/ /-/g' )
          echo "${LINE}(${HTTP_ERROR_MESSAGE})" >> error-report.log
        else
          echo "${LINK} - ${HTTP_RESPONSE_CODE}"
        fi
      done 3< externalLinks.txt
    fi
- name: Test Internal Links
  run: |
    # Checks every `file:line:reference` record of internalLinks.txt and
    # appends the invalid ones to error-report.log.
    if [[ -f internalLinks.txt ]]; then
      # Records are read line by line: this step never set IFS, so a word-split
      # `for` loop turned every record containing a space into several bogus ones.
      while IFS= read -r LINE
      do
        [[ -n "${LINE}" ]] || continue
        FILE=${LINE%%:*}
        REFERENCE=${LINE#*:}
        REFERENCE=${REFERENCE#*:}
        if [[ ${REFERENCE:0:1} == '#' ]]; then
          # Anchor within the same file: look up the link text used with this
          # anchor and expect a heading ("# <text>") carrying that text.
          LINK_TEXT=$( grep -oE "\[.*\]\(${REFERENCE}\)" "${FILE}" | sed 's/\[//' | cut -d ']' -f1 )
          IS_PRESENT=$( grep -oE "# ${LINK_TEXT}" "${FILE}" | wc -w )
          if [[ $IS_PRESENT -eq 0 ]]; then
            echo -e "${LINE} -\e[31m invalid reference\e[0m"
            echo "${LINE}" >> error-report.log
          else
            echo -e "${LINE} -\e[32m valid reference\e[0m"
          fi
        else
          if [[ ${REFERENCE:0:1} == '/' ]]; then
            BASE_DIR=$PWD
          else
            # dirname yields "." for a file in the repository root; cutting the
            # basename out of the path left an empty base directory there, so
            # such references were resolved against the filesystem root.
            BASE_DIR=$( dirname "${FILE}" )
          fi
          # -m canonicalises even when intermediate directories are missing.
          # Without it realpath fails for such references and, because the
          # script runs with `bash -e`, the step aborted before logging them.
          SEARCH_PATH=$( realpath -m "${BASE_DIR}/${REFERENCE}" )
          # if it is neither a valid file nor a valid directory, then it is an invalid reference
          if [[ ! -f "${SEARCH_PATH}" && ! -d "${SEARCH_PATH}" ]]; then
            echo -e "${LINE} -\e[31m invalid reference\e[0m"
            # log the resolved path in place of the reference as written
            echo "${LINE/"$REFERENCE"/$SEARCH_PATH}" >> error-report.log
          else
            echo -e "${LINE} -\e[32m valid reference\e[0m"
          fi
        fi
      done < internalLinks.txt
    fi
- name: Report Error links
  run: |
    # Prints the scan summary and fails the step when error-report.log holds
    # at least one entry.
    if [[ -f error-report.log ]]; then
      BROKEN_TOTAL=$( wc -l < error-report.log )
    fi
    echo -e "\e[32mNo. of files scanned : $( cat *FilesList.txt | wc -l )\e[0m"
    # BROKEN_TOTAL stays unset when no report exists; it then compares as 0.
    if [[ $BROKEN_TOTAL -eq 0 ]]; then
      echo -e "\e[32mNo Broken-links found\e[0m"
    else
      # Report entries are file:line:link, so field 3 onwards is the link itself.
      echo -e "\e[31mNo. of unique broken links : $( cut -d: -f3- error-report.log | sort | uniq | wc -l )\e[0m"
      echo -e "\e[31mTotal No. of reference to broken links : $( cut -d: -f3- error-report.log | sort | wc -l )\e[0m"
      echo "Checkout the log artifact in the summary page for more details about the broken links."
      echo "Note: If any of the reported broken links are just example links or placeholders and are not valid links, please enclose them in triple backticks to ignore them."
      echo "For example: \`\`\`https://This/is/not/a/valid/link.com\`\`\`"
      exit -1
    fi
- name: Modify log file
  if: ${{ failure() || cancelled() }}
  # The input is passed through the environment rather than expanded into the
  # script text.
  env:
    DEBUG_INPUT: ${{ inputs.debug-mode }}
  run: |
    # Rewrites error-report.log into a readable report grouped by file, and
    # removes the intermediate files unless debug mode asks to keep them.

    # This step also runs when an earlier step failed or was cancelled before
    # these files were written; make sure they exist so the report can still
    # be produced.
    touch error-report.log missingFiles.txt
    : > error-reportTmp.log

    # Show paths relative to the repository instead of the runner workspace.
    sed -i "s|${PWD}|HPCC-Platform|g" error-report.log

    FILE_COUNT=1
    while IFS= read -r FILE_NAME
    do
      [[ -n "${FILE_NAME}" ]] || continue
      echo "$FILE_COUNT. $FILE_NAME" >> error-reportTmp.log
      FILE_COUNT=$(( FILE_COUNT + 1 ))
      # Select the entries whose file field equals FILE_NAME exactly. A plain
      # grep matched by regex and by substring, so README.md also collected
      # the entries of docs/README.md.
      while IFS= read -r ENTRY
      do
        printf '\t Line %s\n' "${ENTRY}" | sed 's/:/ : /' >> error-reportTmp.log
      done < <( FILE_NAME="${FILE_NAME}" awk -F':' '$1 == ENVIRON["FILE_NAME"]' error-report.log | cut -d ':' -f2- )
    done < <( cut -d ':' -f1 error-report.log | sort | uniq )

    if [[ $( wc -w < missingFiles.txt ) -eq 0 ]]; then
      echo -e "Broken links: \n" > error-report.log
      cat error-reportTmp.log >> error-report.log
    else
      echo -e "Missing Files:" > error-report.log
      FILE_COUNT=1
      while IFS= read -r FILE
      do
        [[ -n "${FILE}" ]] || continue
        echo -e "${FILE_COUNT}. ${FILE}" >> error-report.log
        FILE_COUNT=$(( FILE_COUNT + 1 ))
      done < missingFiles.txt
      echo -e "\nBroken links: \n" >> error-report.log
      cat error-reportTmp.log >> error-report.log
    fi

    # The input is absent for pull requests; treat that as "not debugging".
    DEBUG_MODE=${DEBUG_INPUT:-false}
    if [[ "${GITHUB_EVENT_NAME}" == "pull_request" || "${DEBUG_MODE}" == "false" ]]; then
      rm -rf *FilesList.txt \
             checkedLinksCache.txt \
             *Links.txt \
             linksList.txt
    fi
- name: Upload logs
  uses: actions/upload-artifact@v4
  # `debug-mode` is declared as a boolean input, so it is compared with the
  # boolean `true`. Comparing it with the string 'true' coerces both sides to
  # numbers (1 and NaN) and is never equal, which meant a successful
  # debug-mode run uploaded nothing.
  if: ${{ failure() || cancelled() || inputs.debug-mode == true }}
  with:
    name: Hyperlinks-testing-log
    # Files already removed by the clean-up in the previous step are simply
    # skipped.
    if-no-files-found: ignore
    path: |
      ./error-report.log
      ./*FilesList.txt
      ./checkedLinksCache.txt
      ./*Links.txt
      ./linksList.txt