main.go
package main

import (
	"crawler/models"
	"fmt"
	"io"
	"net/http"
	"regexp"
	"sync"
)

var (
	// crawled records every URL that has already been downloaded.
	// mu guards crawled, which is shared by all worker goroutines.
	crawled map[string]bool
	mu      sync.Mutex
)
func init() {
	fmt.Println("crawled is initializing...")
	crawled = make(map[string]bool)
	fmt.Println("crawled is initialized...")
}

// N is the number of concurrent download workers.
const N = 800
func main() {
	Crawl("http://www.qq.com")
}
func Crawl(s string) {
	fmt.Println("toCrawl is initializing....")
	toCrawl := make(chan string, 4096)
	fmt.Println("toCrawl is initialized......")
	toCrawl <- s
	fmt.Printf("append %s to toCrawl\n", s)
	c := make(chan int, N)
	for i := 0; i < N; i++ {
		fmt.Printf("start goroutine %d\n", i)
		go Download(toCrawl, c)
	}
	// Workers send on c only after toCrawl is closed, and it never is, so
	// this loop blocks forever and keeps the crawler running.
	for i := 0; i < N; i++ {
		<-c
		fmt.Printf("goroutine %d finished\n", i)
	}
}
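
// An alternative shutdown scheme (a sketch only; CrawlBounded and
// maxPerWorker are illustrative names, not part of the original design):
// give each worker a fixed budget and wait for all of them with a
// sync.WaitGroup instead of blocking on c forever.
//
// func CrawlBounded(seed string, maxPerWorker int) {
// 	toCrawl := make(chan string, 4096)
// 	toCrawl <- seed
// 	var wg sync.WaitGroup
// 	for i := 0; i < N; i++ {
// 		wg.Add(1)
// 		go func() {
// 			defer wg.Done()
// 			for n := 0; n < maxPerWorker; n++ {
// 				select {
// 				case url := <-toCrawl:
// 					_ = url // download url and enqueue its links, as in Download
// 				default:
// 					return // nothing queued right now; stop this worker
// 				}
// 			}
// 		}()
// 	}
// 	wg.Wait()
// }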
func Download(chs chan string, c chan<- int) {
	// Compile the patterns once per worker instead of once per URL.
	titleRe := regexp.MustCompile("<title>(.*)</title>")
	hrefRe := regexp.MustCompile("href=\"(.*?)\"")
	for url := range chs {
		fmt.Printf("Get the url: %s\n", url)
		resp, err := http.Get(url)
		if err != nil {
			fmt.Println("http.Get() error:", err)
			continue
		}
		fmt.Println("http.Get() is successful...")
		content, err := io.ReadAll(resp.Body)
		// Close the body right away: a defer inside this loop would not run
		// until Download returns, leaking one connection per URL.
		resp.Body.Close()
		if err != nil {
			fmt.Println("Read the Body error:", err)
			continue
		}
		// Record the page under its <title>.
		title := titleRe.Find(content)
		models.Add(trim_title(string(title)), url)
		mu.Lock()
		crawled[url] = true
		mu.Unlock()
		// Extract links in a separate goroutine so that a full toCrawl
		// channel does not block this worker.
		go func() {
			urls := hrefRe.FindAll(content, -1)
			for _, l := range urls {
				ref := trim1(string(l))
				mu.Lock()
				_, ok := crawled[ref]
				mu.Unlock()
				if ok {
					continue
				}
				chs <- ref
			}
		}()
	}
	// Reached only if chs is ever closed; see the note in Crawl.
	c <- 1
}
// trim_title strips the surrounding <title> and </title> tags (7 and 8
// characters respectively, 15 in total) from a raw regexp match.
func trim_title(title string) string {
	arr := []rune(title)
	if len(arr) < 15 {
		fmt.Println("title is missing or empty")
		return ""
	}
	return string(arr[7 : len(arr)-8])
}
// trim1 strips the leading `href="` (6 characters) and the trailing quote
// from a raw regexp match.
func trim1(l string) string {
	arr := []rune(l)
	return string(arr[6 : len(arr)-1])
}
// trim is a more general variant of trim1: it returns whatever sits between
// the first and the last double quote of l. Currently unused.
func trim(l string) string {
	arr := []rune(l)
	start := 0
	for i := range arr {
		if arr[i] == '"' {
			break
		}
		start++
	}
	end := len(arr) - 1
	for ; end >= 0; end-- {
		if arr[end] == '"' {
			break
		}
	}
	arr = arr[start+1 : end]
	return string(arr)
}
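
// The manual trimming above can be avoided with capture groups (a sketch
// using the standard regexp API, not how the code above works today):
//
// if m := regexp.MustCompile("<title>(.*)</title>").FindSubmatch(content); m != nil {
// 	title := string(m[1]) // m[1] is the first capture group, tags excluded
// }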
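
// The crawler/models package is not shown in this file. Below is a minimal
// sketch of the interface it would need to satisfy, assuming Add simply
// records a (title, url) pair; the Page type, its fields, and the in-memory
// slice are assumptions for illustration. A real implementation would need
// its own locking, since Add is called from many goroutines concurrently.
//
// package models
//
// type Page struct {
// 	Title string
// 	URL   string
// }
//
// var pages []Page
//
// func Add(title, url string) {
// 	pages = append(pages, Page{Title: title, URL: url})
// }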