-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathurltitle_canonical.sh
executable file
·96 lines (81 loc) · 2.02 KB
/
urltitle_canonical.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#! /bin/bash
# Prints a URL and page title for the URL given as $1.
#
# Environment:
#   DEBUG  - command run as a condition to enable debug output
#            (expected "true"/"false"); defaults to "false".
#   bot32  - version string placed in the User-Agent; "unspecified" when
#            unset or empty.

# Default DEBUG so that `if $DEBUG` further down always has a command to run.
DEBUG="${DEBUG:-false}"

URL="$1"

# The original test `[ -n $bot32 ]` was unquoted, so an empty/unset bot32
# collapsed to `[ -n ]`, which is always true; the "unspecified" fallback
# could never be reached. ${bot32:-unspecified} applies the fallback properly.
# NOTE(review): the contact address in this string reads like an obfuscation
# placeholder — confirm against the upstream script.
useragent="bot32/${bot32:-unspecified} (http://github.com/GinjaNinja32/bot32; [email protected])"
# Read one "<"-terminated record from stdin and split it at the first ">".
#
# Globals written:
#   ENTITY  - text before the first ">" (the tag text when the record
#             started right after a "<")
#   CONTENT - everything after that first ">"
# Returns: the exit status of `read`; non-zero once stdin reaches EOF
#          without another "<".
read_dom () {
  # Split only on ">" for the duration of this call.
  local IFS=\>
  # -r keeps backslashes literal; without it `read` consumes them and a
  # "\<" sequence would not terminate the record.
  read -r -d \< ENTITY CONTENT
}
readit() {
link=""
title=""
while read_dom; do
case "$ENTITY" in
title)
title="$CONTENT"
if [[ "$link" != "" ]]; then
echo "$link"
echo "$title"
exit 0
fi
;;
link\ rel=\"canonical\"\ href=\"*\"*/)
link="${ENTITY#link rel=\"canonical\" href=\"}"
link="${link%\"*/}"
if [[ "$link" =~ ^/.* ]]; then
schema="${URL%%//*}"
domain_path="${URL#*//}"
domain="${domain_path%%/*}"
link="$schema//$domain$link"
fi
if [[ "$title" != "" ]]; then
echo "$link"
echo "$title"
exit 0
fi
;;
*) ;;
esac
done
echo "$URL"
echo "$title"
}
# Fetch the page together with its response headers (--save-headers) so the
# charset can be looked up in them below.
# "$URL" is quoted so it is not word-split or glob-expanded ("?" and "*" are
# common in URLs), and "--" stops a value starting with "-" from being
# parsed as a wget option.
data="$(wget -4 --header="Accept-Language: en-gb, en;q=0.7" --header="User-Agent: $useragent" -qT 10 -O - --save-headers -- "$URL")"

# Keep everything from the first blank (whitespace-only) line onward, i.e.
# what follows the saved headers (grep prints up to 999999 trailing lines).
html="$(printf '%s\n' "$data" | grep -aEA999999 '^\s*$')"

# Let file(1) classify the payload; anything it does not describe as text is
# rejected.
datatype="$(printf '%s\n' "$html" 2>/dev/null | file -)"
if $DEBUG; then
  echo "datatype: $datatype"
fi
if [[ "$datatype" != *text* ]]; then
  echo "datatype is not text"
  exit 0
fi

# Charset, attempt 1: take the first line that is either empty or a
# Content-Type line carrying charset=. If the empty line comes first, the sed
# substitution does not apply and charset stays empty.
charset="$(printf '%s\n' "$data" | grep -aE '^$|Content-Type:.*charset=.*' | head -n1 | sed -r 's/Content-Type:.*charset=(\S+).*/\1/g')"
if $DEBUG; then
  echo "charset: $charset"
fi

# Charset, attempt 2: a <meta ... charset=...> tag inside the document.
if [[ -z "$charset" ]]; then
  charset="$(printf '%s\n' "$html" | grep -aE '<meta [^>]*charset="?[^"]+"?>' | sed -r 's/.*charset="?([^"]+)"?>.*/\1/g' | head -n1)"
  if $DEBUG; then
    echo "charset: $charset"
  fi
fi

# Charset, attempt 3: ask uchardet; its "ascii/unknown" answer is ignored.
if [[ -z "$charset" ]]; then
  ncharset="$(printf '%s\n' "$html" | uchardet)"
  if [[ "$ncharset" != "ascii/unknown" ]]; then
    charset="$ncharset"
    if $DEBUG; then
      echo "charset: $charset"
    fi
  fi
fi

# Convert to UTF-8 unless no charset was found or it is literally "utf8".
if [[ -n "$charset" && "$charset" != "utf8" ]]; then
  html="$(printf '%s\n' "$html" | iconv -f "$charset" -t utf8 -)"
fi

# readit may exit before consuming all input; 2>/dev/null hides the writer's
# resulting broken-pipe complaint.
printf '%s\n' "$html" 2>/dev/null | readit