-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.pl
executable file
·121 lines (106 loc) · 2.85 KB
/
scraper.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env perl
# Copyright 2014 Michal Špaček <[email protected]>
# Pragmas.
use strict;
use warnings;
# Modules.
use Database::DumpTruck;
use Encode qw(decode_utf8 encode_utf8);
use English;
use HTML::TreeBuilder;
use LWP::UserAgent;
use POSIX qw(strftime);
use URI;
use Time::Local;
# Constants.
my $DATE_WORD_HR = {
decode_utf8('leden') => 1,
decode_utf8('únor') => 2,
decode_utf8('březen') => 3,
decode_utf8('duben') => 4,
decode_utf8('květen') => 5,
decode_utf8('červen') => 6,
decode_utf8('červenec') => 7,
decode_utf8('srpen') => 8,
decode_utf8('září') => 9,
decode_utf8('říjen') => 10,
decode_utf8('listopad') => 11,
decode_utf8('prosinec') => 12,
};
# Don't buffer.
$OUTPUT_AUTOFLUSH = 1;
# URI of service.
my $base_uri = URI->new('http://www.sever.brno.cz/omezeni-dopravy/92-vykopove-prace.html');
# Open a database handle.
my $dt = Database::DumpTruck->new({
'dbname' => 'data.sqlite',
'table' => 'data',
});
# Create a user agent object.
my $ua = LWP::UserAgent->new(
'agent' => 'Mozilla/5.0',
);
# Get base root.
print 'Page: '.$base_uri->as_string."\n";
my $root = get_root($base_uri);
# Look for items.
my $doc_items = $root->find_by_attribute('class', 'blog');
my @doc = $doc_items->find_by_tag_name('div');
foreach my $doc (@doc) {
my $doc_attr = $doc->attr('class');
if ($doc_attr !~ m/^item\s+column-\d+$/ms) {
next;
}
# Title and start date.
my $title_h2 = $doc->find_by_tag_name('h2');
my $title_a = $title_h2->find_by_tag_name('a');
my $title = $title_a->as_text;
remove_trailing(\$title);
my $link = URI->new($base_uri->scheme.'://'.$base_uri->host.
$title_a->attr('href'));
if (!$title_h2->find_by_attribute('class', 'date')) {
next;
}
my $date_start = get_db_date_word($title_h2
->find_by_attribute('class', 'date')->as_text);
# Description.
my $desc = $doc->find_by_attribute('class', 'article-anot')->as_text;
# TODO Update
print '- '.encode_utf8($title)."\n";
$dt->insert({
'Title' => $title,
'Start_date' => $date_start,
'Description' => $desc,
'Page_link' => $link->as_string,
});
}
# Get database data from word date.
sub get_db_date_word {
my $date_word = shift;
$date_word =~ s/^\s*-\s+//ms;
my ($day, $mon_word, $year) = $date_word =~ m/^\s*(\d+)\.\s*(\w+)\s+(\d+)\s*$/ms;
my $mon = $DATE_WORD_HR->{$mon_word};
my $time = timelocal(0, 0, 0, $day, $mon - 1, $year - 1900);
return strftime('%Y-%m-%d', localtime($time));
}
# Get root of HTML::TreeBuilder object.
sub get_root {
my $uri = shift;
my $get = $ua->get($uri->as_string);
my $data;
if ($get->is_success) {
$data = $get->content;
} else {
die "Cannot GET '".$uri->as_string." page.";
}
my $tree = HTML::TreeBuilder->new;
$tree->parse(decode_utf8($data));
return $tree->elementify;
}
# Removing trailing whitespace.
sub remove_trailing {
my $string_sr = shift;
${$string_sr} =~ s/^\s*//ms;
${$string_sr} =~ s/\s*$//ms;
return;
}