-
Notifications
You must be signed in to change notification settings - Fork 0
/
main2.pl
executable file
·95 lines (84 loc) · 2.14 KB
/
main2.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#! /usr/bin/perl
use warnings;
use strict;
use WWW::Mechanize;
use HTML::TreeBuilder::XPath;
# scraper($dir, $url)
#
# Mirror the files listed on an HTTP directory-index page into a local
# directory, recursing into sub-directories.
#
#   $dir - local target directory; created when it does not exist. The
#          process chdir()s into it and stays there on return (the
#          recursive caller steps back out with chdir '..').
#   $url - URL of the index page that corresponds to $dir.
#
# The page is expected to carry its listing inside <pre>, one entry per
# line in the form "<name>  DD-Mon-YYYY HH:MM  <size>", where a size
# containing "-" marks a sub-directory. Names are taken from the <a href>
# attributes rather than the visible text, because the visible text may be
# a truncated form of a long name.
#
# A file is downloaded with the external `axel` program unless a local
# file whose name matches already exists and is at least as large as the
# size shown on the server. A smaller local copy is treated as an
# incomplete download: it and its "<file>.*" companions are removed and the
# file is fetched again.
#
# Dies when the directory cannot be created, opened or entered, and when
# WWW::Mechanize fails to fetch $url (NOTE(review): relies on Mechanize's
# default autocheck behaviour - confirm for the installed version).
sub scraper {
    my ($dir, $url) = @_;

    unless (-d $dir) {
        mkdir $dir or die "Cannot create directory '$dir': $!";
    }

    # Names of the files already present, ignoring dot-files. A lexical
    # handle is used because a bareword handle would be a single global
    # shared by every level of the recursion.
    opendir(my $dh, $dir) or die "Cannot open directory '$dir': $!";
    my @ls = grep { !/^\./ } readdir($dh);
    closedir($dh);

    chdir $dir or die "Cannot chdir to '$dir': $!";

    # Fetch and parse the index page.
    my $mech = WWW::Mechanize->new();
    $mech->get($url);
    my $tree = HTML::TreeBuilder::XPath->new();
    $tree->parse($mech->content);
    $tree->eof;    # tell the parser the document is complete

    # Pull everything needed out of the tree as plain strings so the tree
    # can be released before any recursion starts.
    my ($body)  = $tree->findnodes('/html/body');
    my $listing = defined $body ? $body->findvalue('pre') : '';
    my @hrefs   = map { $_->findvalue('@href') }
                  $tree->findnodes('/html/body/pre/a');
    $tree->delete;

    # Index of the next link to consume. Starts at 1 so the first link is
    # skipped; set to a higher value to skip any additional links that
    # appear above the files.
    my $href = 1;

    # Consume the listing one entry at a time. The regex could be simplified
    # but is kept this way so more fields can easily be captured later.
    while ($listing =~ s/(.+)\s+\d\d-\w{3}-\d{4} \d\d:\d\d\s+(\S+)//) {
        # Copy the captures right away; later matches would clobber them.
        my ($name, $size) = ($1, $2);
        next if $name =~ /Parent Directory/;

        my $file = $hrefs[$href++];
        unless (defined $file) {
            warn "More listing rows than links on $url; stopping\n";
            last;
        }
        $file =~ s/^\s+//;
        $file =~ s/\s+$//;

        # Sub-directory: recurse, then step back out. The current working
        # directory is already $dir, so the child is addressed relative to
        # it; prefixing $dir again would break whenever $dir is relative.
        if ($size =~ /-/) {
            scraper($file, "$url/$file");
            chdir '..' or die "Cannot chdir back out of '$file': $!";
            next;
        }

        # Don't redownload anything we already have, unless it is bigger on
        # the server, as we may have an incomplete file from an earlier run.
        my $skip = 0;
        for my $have (@ls) {
            # \Q..\E: the local name is data, not a pattern.
            next unless $file =~ /\Q$have\E$/;

            # -s yields undef (missing) or '' (empty); treat both as 0.
            my $localsize = (-s $file) || 0;

            # NOTE(review): assumes the size column is a plain byte count.
            if ($size > $localsize) {
                unlink $file;
                # Also drop "<file>.*" companions. Done by prefix match on
                # the directory contents so that no shell is involved.
                if (opendir(my $cwd, '.')) {
                    unlink grep { index($_, "$file.") == 0 } readdir($cwd);
                    closedir($cwd);
                }
            }
            else {
                $skip++;
            }
        }
        next if $skip;

        # Run axel without a shell (list-form pipe open), since $url and
        # $file come from the remote page. Its standard output is read and
        # thrown away.
        if (open(my $axel, '-|', 'axel', "$url/$file")) {
            while (defined(my $line = <$axel>)) { }
            close $axel;
        }
        else {
            warn "Cannot run axel for '$url/$file': $!\n";
        }
    }
    return;
}
# Script entry point.
#
# Usage: main2.pl URL DIR
#
# @ARGV is 0-based and does not contain the program name, so the normal
# two-argument call puts URL in $ARGV[0] and DIR in $ARGV[1]. Calls with
# three or more arguments keep the older behaviour of reading URL and DIR
# from positions 1 and 2. Any other argument count falls back to the
# built-in defaults.
my ($dir, $url);
if (@ARGV == 2) {
    ($url, $dir) = @ARGV;
}
elsif (@ARGV >= 3) {
    ($url, $dir) = @ARGV[1, 2];
}
else {
    $dir = '/home/jon/dev/Index-Scraper';
    $url = "http://templeos.org/Videos";
}

# The & call form skips prototype checking, so this two-argument call is
# accepted however scraper is declared.
&scraper($dir, $url);