-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.php
141 lines (113 loc) · 4.33 KB
/
scraper.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
<?php
require 'scraperwiki.php';
require 'scraperwiki/simple_html_dom.php';
// JUST AN EXPERIMENT, JUST A BEGINNING
// Connected to https://github.com/okfn/publicbodies
define('BASE_URL', 'http://www.staatskalender.admin.ch/');
date_default_timezone_set('UTC');
// sqliteexecute not currently supported on Morph.io
// scraperwiki::sqliteexecute('CREATE TABLE IF NOT EXISTS swdata (created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP)');

// First level of hierarchy: every "a.navLevel2" link on the welcome page is
// imported as a root entity; create() then recurses into its children.
$welcomePage = getHTML(BASE_URL . 'welcome.html');
if (!$welcomePage) {
    // getHTML() yields a falsy value when the page could not be fetched or
    // parsed; stop with a clear message instead of a fatal error on ->find().
    error_log('Could not load ' . BASE_URL . 'welcome.html');
    exit(1);
}
foreach ($welcomePage->find('a.navLevel2') as $el) {
    $entity = array(
        'title' => $el->title,
        // Spaces in the href are replaced by "+" to form the request URL.
        'source_url' => BASE_URL . str_replace(' ', '+', $el->href)
    );
    create($entity);
}
/**
 * Import every child entity linked from an entity's detail page.
 *
 * Each link inside "div.infoblock" whose href starts with "navigate.html?dn="
 * is handed to create(), which in turn calls back into this function for the
 * child's own page.
 *
 * @param array $parent Parent entity; its 'title' and 'key' entries are read.
 * @param mixed $html   Parsed DOM of the parent's page (must provide find()).
 */
function getChildren($parent, $html) {
    $links = $html->find('div.infoblock a[href^=navigate.html?dn=]');
    foreach ($links as $link) {
        $label = trim($link->innertext);
        $target = str_replace(' ', '+', $link->href);
        // The child's title is prefixed with the full parent title.
        create(array(
            'title' => $parent['title'] . ' - ' . $label,
            'source_url' => BASE_URL . $target,
            'parent_key' => $parent['key']
        ));
    }
}
/**
 * Fetch an entity's detail page, extract its attributes, store it and then
 * import its children.
 *
 * Fields added to the entity when present on the page: 'key', 'email', 'url',
 * 'abbr' and 'address'.
 *
 * @param array $entity Must contain 'title' and 'source_url'; may contain
 *                      'parent_key'.
 * @return mixed The result of error_log() when the page cannot be loaded,
 *               otherwise nothing.
 */
function create($entity) {
    $html = getHTML($entity['source_url']);
    if (!$html) {
        // "." concatenates in PHP; "+" would attempt numeric addition.
        return error_log('Could not import ' . $entity['source_url']);
    }

    // Clean title
    $entity['title'] = trim(html_entity_decode($entity['title'], ENT_COMPAT, 'UTF-8'));

    // Generate key
    $entity['key'] = 'ch/' . makeSlug($entity['title']);

    // Extract email: first mailto: link inside the details table.
    $emailLink = $html->find('table.tabelleESK a[href^=mailto:]', 0);
    if ($emailLink) {
        $entity['email'] = str_replace('mailto:', '', trim($emailLink->href));
    }

    // Extract URL from the row whose first cell mentions "homepage".
    foreach ($html->find('table.tabelleESK tr') as $row) {
        $labelCell = $row->children(0);
        if (!$labelCell) {
            // Row without any cell: nothing to inspect.
            continue;
        }
        $name = trim($labelCell->innertext);
        // Haystack first, needle second: look for "homepage" inside the label.
        if (stripos($name, 'homepage') === false) {
            continue;
        }
        foreach ($row->children() as $i => $cell) {
            $link = $cell->find('a[href]', 0);
            if ($link) {
                $entity['url'] = $link->href;
                break;
            }
            // Ugly case: URL but no link. Skip the label cell itself ($i === 0)
            // and take the text of the first cell without child elements.
            if ($i && !$cell->children()) {
                $entity['url'] = $cell->innertext;
                break;
            }
        }
    }

    // Find entity abbreviation: the part after the last "-" in the page heading.
    $orgTitle = $html->find('h2.titleOrg', 0);
    if ($orgTitle) {
        $orgTitleParts = explode('-', $orgTitle->innertext);
        if (count($orgTitleParts) > 1) {
            $entity['abbr'] = trim(array_pop($orgTitleParts));
            $entity['abbr'] = html_entity_decode($entity['abbr'], ENT_COMPAT, 'UTF-8');
        }
    } else {
        error_log('No title on ' . $entity['source_url']);
    }

    // Find entity address: the markup between the heading and the first
    // "titleContent" block, flattened to a single comma-separated line.
    $infoBlockNode = $html->find('div.infoblock', 0);
    if ($infoBlockNode
        && preg_match('/<\/h2>(.+?)<div class="titleContent">/s', $infoBlockNode->innertext, $matches)
    ) {
        $address = str_replace(array('<br>', '(neues Fenster)'), array(', ', ''), $matches[1]);
        $address = preg_replace('/\s\s+/', ' ', $address);
        $address = trim(html_entity_decode(strip_tags($address), ENT_COMPAT, 'UTF-8'));
        $entity['address'] = $address;
    }

    save($entity);
    getChildren($entity, $html);
}
/**
 * Store an entity, stamping it with the fixed jurisdiction/source metadata.
 *
 * Rows are saved with 'key' as the unique column. When the entity has no
 * 'url', its 'source_url' is used instead.
 *
 * @param array $entity Entity to persist; must contain 'key' and 'source_url'.
 * @return mixed Whatever scraperwiki::save_sqlite() returns.
 */
function save($entity) {
    $now = date('Y-m-d H:i:s');
    $stamped = array(
        'updated_at' => $now,
        'jurisdiction_code' => 'CH',
        'jurisdiction' => 'Switzerland',
        'source' => 'Eidgenössischer Staatskalender',
        'source_description' => 'Federal-level entities of the Swiss government',
        'category' => 'Federal',
        // Suboptimal, temporary workaround for the missing auto-timestamp /
        // sqliteexecute support: created_at simply mirrors updated_at.
        'created_at' => $now
    );
    foreach ($stamped as $field => $value) {
        $entity[$field] = $value;
    }

    if (!isset($entity['url'])) {
        $entity['url'] = $entity['source_url'];
    }

    $uniqueKeys = array('key');
    return @scraperwiki::save_sqlite($uniqueKeys, $entity);
}
/**
 * Create a URL-friendly identifier from a title.
 *
 * The string is trimmed, the accented letters listed below are transliterated
 * to ASCII, the remainder is lower-cased, and every run of non-word characters
 * is replaced by a single "-".
 *
 * @param string $str Text to convert (expected to be UTF-8).
 * @return string Lower-case slug.
 */
function makeSlug($str) {
    // strtolower() works byte-wise and leaves multi-byte letters untouched, so
    // upper-case accented letters need their own entries; otherwise they fall
    // through to the \W+ replacement and become "-". Hand-rolled because no
    // iconv locales were available on Scraperwiki.
    $transliterations = array(
        'ä' => 'ae', 'ö' => 'oe', 'ü' => 'ue', 'à' => 'a', 'è' => 'e', 'é' => 'e',
        'Ä' => 'ae', 'Ö' => 'oe', 'Ü' => 'ue', 'À' => 'a', 'È' => 'e', 'É' => 'e'
    );

    // Transliterate before lower-casing so the multi-byte sequences are still
    // intact when they are looked up in the map.
    $ascii = strtolower(strtr(trim($str), $transliterations));

    return preg_replace('/\W+/', '-', $ascii);
}
/**
 * Download a page and parse it into a DOM.
 *
 * @param string $url Address of the page to fetch.
 * @return mixed Whatever str_get_html() returns for the downloaded markup;
 *               callers treat a falsy result as a failed import.
 */
function getHTML($url) {
    $markup = scraperwiki::scrape($url);
    return str_get_html($markup);
}
?>