forked from amrrs/HN_scraper_in_R
-
Notifications
You must be signed in to change notification settings - Fork 0
/
hn_scraper_JB.R
51 lines (30 loc) · 1.16 KB
/
hn_scraper_JB.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
library(rvest)
# Scrape the Hacker News front page into a data frame with one row per story.
#
# Objects created at top level:
#   df  - title, link_domain, score, age (NA where a story lacks a field)
#   df1 - the same rows without the score column
url <- "https://news.ycombinator.com/"

# (An interactive help lookup used to sit here; run
# `?rvest::read_xml.response` at the console if you need it.)
content <- read_html(url)

# Why per-story extraction:
# Selecting each field page-wide with html_nodes() drops stories that lack
# that field, so the vectors end up with different lengths and data.frame()
# fails with:
#   arguments imply differing number of rows: 30, 29
# (observed: title/link_domain/age had 30 entries, score only 29 - one entry
# too few). Worse, when a value is missing in the middle, every later value
# would be paired with the wrong story.
#
# html_node() (singular) applied to a node set returns exactly one result
# per input node, NA where nothing matches, which keeps the columns aligned.
#
# NOTE(review): assumes every story is a `tr.athing` row (title + domain)
# paired with one `td.subtext` cell (score + age) - confirm against the
# live page markup. The field selectors below are the original ones; verify
# they still match, since the site's HTML may have changed.
story_rows <- content %>% html_nodes("tr.athing")
subtext_cells <- content %>% html_nodes("td.subtext")

# Fail loudly if the pairing assumption does not hold instead of producing
# misaligned rows.
stopifnot(length(story_rows) == length(subtext_cells))

# News title
title <- story_rows %>%
  html_node("a.storylink") %>%
  html_text()

# News link domain (NA for stories without an external link)
link_domain <- story_rows %>%
  html_node("span.sitestr") %>%
  html_text()

# Link score / upvotes (NA for stories without a score)
score <- subtext_cells %>%
  html_node("span.score") %>%
  html_text()

# Link age (submission time)
age <- subtext_cells %>%
  html_node("span.age") %>%
  html_text()

# Final data frame: all four vectors now have one entry per story.
df <- data.frame(
  title = title,
  link_domain = link_domain,
  score = score,
  age = age
)

# Naive way of extracting the entire page content with this table
# tb <- content %>% html_node('table.itemlist') %>% html_text()

# JB: variant without the score column, kept for backward compatibility
# with the earlier workaround.
df1 <- data.frame(
  title = title,
  link_domain = link_domain,
  age = age
)
df1