-
Notifications
You must be signed in to change notification settings - Fork 0
/
2.1News.R
49 lines (41 loc) · 1.48 KB
/
2.1News.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# TODOs:
# 1. Handle headline listings that span more than one page.
# 2. Format the output as HTML.
library(stringr)
library(XML)
library(RCurl)
# Fetch today's Xinwen Lianbo index page, collect the links of all headlines
# whose anchor text contains "[视频]", download each linked page and print
# the text of its content body.

# Index page URL: base path + today's date as YYYYMMDD + ".shtml".
url <- "http://cctv.cntv.cn/lm/xinwenlianbo/"
url <- str_c(url, format(Sys.Date(), "%Y%m%d"), ".shtml")

# Gatherer whose update function is registered as curl's debug callback.
info <- debugGatherer()

# A single curl handle shared by the index request and every article request.
handle <- getCurlHandle(
  cookiejar = "",
  followlocation = TRUE,
  autoreferer = TRUE,
  debugfunc = info$update,
  verbose = TRUE,
  httpheader = list(
    # NOTE(review): this address looks redacted in this copy of the file;
    # confirm the intended contact address before running.
    from = "[email protected]",
    "user-agent" = str_c(R.version$version.string, ", ", R.version$platform)
  )
)

# asText = TRUE: the argument is downloaded markup, never a file name or URL.
main <- getURL(url, curl = handle, .encoding = "UTF-8")
main <- htmlParse(main, asText = TRUE)

# Every <ul> directly below the headline container <div>.
headlines <- getNodeSet(main, "//div[@class='title_list_box_130503']/ul")

# Collect hrefs per list into a preallocated list, then flatten once.
# seq_along() keeps the loop from running when no list was found.
link_groups <- vector("list", length(headlines))
for (i in seq_along(headlines)) {
  anchors <- getNodeSet(headlines[[i]], "li/a")
  # NOTE(review): scanning stops at the first list without any links, as in
  # the original script; confirm whether skipping it (`next`) was intended.
  if (length(anchors) == 0) break

  # One entry per anchor: its href when the text is tagged "[视频]", else NA.
  # Returning NA (not NULL) keeps the result a plain character vector.
  hrefs <- vapply(
    anchors,
    function(anchor) {
      if (isTRUE(str_detect(xmlValue(anchor), "\\[视频\\]"))) {
        xmlGetAttr(anchor, "href", default = NA_character_)
      } else {
        NA_character_
      }
    },
    character(1),
    USE.NAMES = FALSE
  )
  link_groups[[i]] <- hrefs[!is.na(hrefs)]
}
links <- as.character(unlist(link_groups, use.names = FALSE))

# Download each article (echoing its URL first) and extract the text of the
# element with id "content_body". lapply() always yields a list, so the
# result shape does not depend on how many nodes each page contains.
text <- lapply(links, function(link) {
  print(link)
  page <- getURL(link, curl = handle, .encoding = "UTF-8")
  page <- htmlParse(page, asText = TRUE)
  xpathSApply(page, path = "//div[@id='content_body']", fun = xmlValue)
})

# Emit every extracted article body to standard output.
for (body in text) {
  cat(body)
}