-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathscraper.rb
60 lines (47 loc) · 1.57 KB
/
scraper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
require 'scraperwiki'
require 'mechanize'
# Address of the council's planning application search page, without any
# query string. Also used as the prefix for per-application and "next page" URLs.
url_base = 'https://www.yarracity.vic.gov.au/planning-application-search'

# First results page: every suburb, street and ward, restricted to
# applications whose status is "Current".
url = "#{url_base}?suburb=(All)&street=(All)&status=Current&ward=(All)"
# Flattens a piece of scraped text onto a single line.
#
# Every carriage return and line feed becomes a space, runs of consecutive
# spaces are collapsed to one, and leading/trailing whitespace is removed.
# Tabs inside the text are left alone (only spaces are squeezed).
#
# a - the String to tidy up.
#
# Returns a new String; the argument is not modified.
def clean_whitespace(a)
  # tr pads the replacement set with its last character, so both "\r" and
  # "\n" map to a single space here.
  single_line = a.tr("\r\n", ' ')
  single_line.squeeze(' ').strip
end
# Reads every application row from one search-results page and saves each one
# with ScraperWiki.save_sqlite, keyed on 'council_reference'.
#
# page     - the results page; must respond to `search` with a CSS selector
#            (the script passes the object returned by the Mechanize agent).
# url_base - search page URL without a query string; used to build each
#            record's 'info_url'.
#
# Cells are read by position: 0 council reference, 1 date received,
# 2 address, 3 description, 4 on-notice-from date (optional).
#
# Rows with fewer than four cells (for example a "no results" placeholder
# row) are skipped with a message instead of aborting the whole scrape.
# The 'on_notice_from' key is only set when the fifth cell holds a parseable
# date.
#
# Raises whatever Date.parse raises if a row's date-received cell cannot be
# parsed.
def get_page_data(page, url_base)
  # NOTE(review): this value looks like an e-mail obfuscation placeholder
  # rather than a real address - confirm the intended mailto target.
  comment_url = "mailto:[email protected]"

  page.search('table.search tbody tr').each do |tr|
    texts = tr.search('td').map(&:inner_text)

    # The four leading cells are all required to build a record.
    if texts.size < 4
      puts "Skipping row with #{texts.size} cell(s); expected at least 4"
      next
    end

    council_reference = clean_whitespace(texts[0])
    record = {
      'info_url' => "#{url_base}?applicationNumber=#{council_reference}",
      'comment_url' => comment_url,
      'council_reference' => council_reference,
      'date_received' => Date.parse(texts[1]).to_s,
      'address' => clean_whitespace(texts[2]),
      'description' => clean_whitespace(texts[3]),
      'date_scraped' => Date.today.to_s
    }

    # The on-notice date is optional: a missing or blank cell is expected and
    # is handled without relying on an exception.
    notice_text = texts[4].to_s.strip
    unless notice_text.empty?
      begin
        record['on_notice_from'] = Date.parse(notice_text).to_s
      rescue ArgumentError
        # Date.parse reports an invalid date with ArgumentError (or a
        # subclass of it); leave the key unset in that case.
      end
    end

    puts "Saving record #{council_reference} - #{record['address']}"
    ScraperWiki.save_sqlite(['council_reference'], record)
  end
end
# Entry point: fetch the first results page, then walk the pagination until
# no "Next" link is left, saving the rows of every page along the way.
agent = Mechanize.new
# NOTE(review): certificate verification is switched off for every request
# made by this agent - confirm this is still required for the target site.
agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
page = agent.get(url)

loop do
  get_page_data(page, url_base)

  # The pager is a set of anchors; the one labelled exactly "Next" leads to
  # the following page. Its href is appended to the base search URL.
  pager_anchors = page.search('div.pagination-container').search('a')
  next_anchor = pager_anchors.find { |anchor| anchor.inner_text == 'Next' }
  break unless next_anchor

  next_url = url_base + next_anchor['href']
  puts next_url
  page = agent.get(next_url)
end