-
Notifications
You must be signed in to change notification settings - Fork 2
/
scraper.rb
64 lines (51 loc) · 2.23 KB
/
scraper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
require 'scraperwiki'
require 'mechanize'
# Finds the anchor elements that sit inside the page's "div.col_4" containers.
#
# @param page [#search] parsed page; must respond to +search+ with a CSS selector
# @return whatever searching the matched divs for 'a' yields
def get_page_elements(page)
  page.search("div.col_4").search('a')
end
# Narrows the anchors from the main div down to the planning application links.
#
# @param links_on_main_div [Enumerable] links in the order they appear on the page
# @return [Array] every link after the first two
def return_only_application_links(links_on_main_div)
  # The first two are links to documents, we don't need those
  links_on_main_div.drop(2)
end
# Reads one planning application from its detail page and stores it through
# ScraperWiki, unless a record with the same council reference is already saved.
#
# @param planning_application_page [#search] parsed detail page of one application
# @param url [String] URL written to +info_url+ and embedded in +comment_url+
# @return [nil] the result of the final +puts+
def save_one_application(planning_application_page, url)
  # On Wednesday July 20, each application was on a page in a single element
  # broken up with <br> tags. For that reason the child nodes are accessed
  # directly by position.
  elements_from_page = planning_application_page.search("div.col_4").children

  # The notice date is split on "/" and re-joined as year-month-day.
  on_notice_to_unformatted = elements_from_page[14].inner_text.strip
  day, month, year = on_notice_to_unformatted.split("/")
  on_notice_to_formatted = "#{year}-#{month}-#{day}"

  record = {
    "info_url" => url,
    # NOTE(review): this is "mailto: " followed by a web URL rather than an
    # e-mail address -- confirm the intended comment destination.
    "comment_url" => "mailto: #{url}",
    "council_reference" => elements_from_page[4].inner_text.strip,
    "address" => "#{elements_from_page[2].inner_text.strip}, VIC",
    "on_notice_to" => on_notice_to_formatted,
    "description" => elements_from_page[9].inner_text.strip,
    "date_scraped" => Date.today.to_s
  }
  council_reference = record['council_reference']

  # The reference is scraped text, so it is bound as a query parameter instead
  # of being spliced into the SQL string (a quote in it would break the query).
  not_yet_saved =
    begin
      ScraperWiki.select("* from data where `council_reference`=?", [council_reference]).empty?
    rescue StandardError
      # Best effort: any lookup failure is treated as "not saved yet"
      # (presumably this covers the first run, before the table exists).
      true
    end

  if not_yet_saved
    ScraperWiki.save_sqlite(['council_reference'], record)
    puts "Saving record " + council_reference
  else
    puts "Skipping already saved record " + council_reference
  end
end
# Script entry point: fetch the listing page, then visit and save each
# advertised application found on it.
url = "http://www.brimbank.vic.gov.au/DEVELOPMENT/Planning/Current_Advertised_Applications"
agent = Mechanize.new

# This site has links to each planning application in one giant div. Each application is on a separate page
application_links = return_only_application_links(get_page_elements(agent.get(url)))

application_links.each do |link|
  # Wait one second before each detail-page request.
  sleep(1)
  detail_page = agent.get(link.attributes['href'])
  # NOTE(review): the listing URL (not the detail page's own URL) is passed on
  # and ends up as every record's info_url -- confirm this is intended.
  save_one_application(detail_page, url)
end