-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathspider.php
117 lines (117 loc) · 2.74 KB
/
spider.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
<?
//Crawls webpages starting with a list of seed URLs
//Stores the fetched documents into the Documents database allotting each document a unique ID.
ini_set("max_execution_time",'6000');
require_once("content-compare.php");
require_once("simple_html_dom.php");
require_once("stemmer.php");
/**
 * Reports whether a page is a near-duplicate of a document already stored.
 *
 * The candidate is compared against the decoded "ContentStem" column of every
 * row in the Documents table using document_similarity(); a score of 0.9 or
 * higher counts as a duplicate.
 *
 * @param mixed $page1 Stemmed content of the candidate page, in whatever form
 *                     document_similarity() expects (presumably the output of
 *                     get_stem_words() - confirm against content-compare.php).
 * @return bool True when at least one stored document scores >= 0.9, false
 *              otherwise (including when the database cannot be read).
 */
function is_similar($page1)
{
    // NOTE(review): database credentials are hard-coded; consider moving them to configuration.
    $conn = new mysqli("localhost", "root", "meetsid20", "Search_Engine");
    if ($conn->connect_error) {
        trigger_error("is_similar: database connection failed: " . $conn->connect_error, E_USER_WARNING);
        return false;
    }

    // Only the stemmed content is compared, so do not pull the full page text of every row.
    $result = $conn->query("select ContentStem from Documents");
    if ($result === false) {
        trigger_error("is_similar: could not read Documents: " . $conn->error, E_USER_WARNING);
        $conn->close();
        return false;
    }

    $found = false;
    while ($page = $result->fetch_assoc()) {
        $stored = json_decode($page["ContentStem"], true);
        if ($stored === null) {
            // Undecodable (or empty) stored value: nothing meaningful to compare against.
            continue;
        }
        if (document_similarity($page1, $stored) >= 0.9) {
            $found = true;
            break;
        }
    }

    $result->free();
    $conn->close();
    return $found;
}
/**
 * Checks that a URL ultimately answers with HTTP 200 and an HTML content type.
 *
 * get_headers() follows redirects and returns the headers of every response
 * in the chain, so the scan restarts its verdict at each status line and only
 * the final response decides the outcome.
 *
 * @param string $url Absolute URL to probe.
 * @return bool True when the final response has status 200 and a Content-Type
 *              header mentioning "html"; false otherwise, including when the
 *              headers cannot be fetched at all.
 */
function url_exists($url)
{
    // Warnings are suppressed on purpose: an unreachable or malformed URL is an
    // expected outcome for a crawler and is reported through the false return.
    $headers = @get_headers($url);
    if (!is_array($headers)) {
        return false;
    }

    $isOk = false;
    $isHtml = false;
    foreach ($headers as $header) {
        if (!is_string($header)) {
            continue;
        }
        // A status line starts a new response (e.g. after a redirect): reset both flags.
        // Matching on the numeric code also accepts servers that omit the "OK" reason phrase.
        if (preg_match('#^HTTP/\d+(?:\.\d+)?\s+(\d{3})\b#i', $header, $match)) {
            $isOk = ($match[1] === '200');
            $isHtml = false;
            continue;
        }
        // Header names and media types are case-insensitive.
        if (stripos($header, 'Content-Type:') === 0 && stripos($header, 'html') !== false) {
            $isHtml = true;
        }
    }

    return $isOk && $isHtml;
}
// ---------------------------------------------------------------------------
// Crawl driver.
//
// Loads the seed URLs from Crawler_Seeds, then walks the link graph breadth
// first: URLs are taken from the end of $q and newly discovered links are
// added at the front. Each fetched page that is non-empty and not a
// near-duplicate of a stored document is inserted into Documents. The crawl
// stops after 1500 stored pages or when the queue runs dry. Links are only
// followed from pages at depth 1 or 2 (seeds are depth 1) and only when the
// resulting URL contains "iiit".
// ---------------------------------------------------------------------------

// NOTE(review): database credentials are hard-coded; consider moving them to configuration.
$conn = new mysqli("localhost", "root", "meetsid20", "Search_Engine");
if ($conn->connect_error) {
    die("Could not connect to the database: " . $conn->connect_error . "\n");
}

$result = $conn->query("select * from Crawler_Seeds");
if ($result === false) {
    die("Could not load crawler seeds: " . $conn->error . "\n");
}

$q = array();        // pending URLs; the next URL to fetch is the last element
$visited = array();  // URL => crawl depth at which it was first queued
while ($seed = $result->fetch_assoc()) {
    array_push($q, $seed["URL"]);
    $visited[$seed["URL"]] = 1;
}
$result->free();

// Prepared once and reused: page text routinely contains quotes and the JSON
// columns contain backslashes, both of which break (or inject into) a
// statement built by string interpolation.
$insert = $conn->prepare("insert into Documents values(null,?,?,?,?,?)");
if ($insert === false) {
    die("Could not prepare the document insert: " . $conn->error . "\n");
}

$count = 0;
while (count($q) > 0 && $count < 1500) {
    $URL = array_pop($q);

    if (!url_exists($URL)) {
        continue;
    }
    $html = file_get_html($URL);
    if (!is_object($html)) {
        // file_get_html() returns false when the page cannot be loaded or parsed.
        continue;
    }

    // Pull everything needed out of the DOM up front so it can be released
    // before the (slow) similarity check and database work.
    $titleNode = $html->find("title", 0);
    $title = is_object($titleNode) ? $titleNode->innertext : "";
    $content = $html->plaintext;

    $hrefs = array();
    if ($visited[$URL] < 3) {
        foreach ($html->find("a") as $anchor) {
            $hrefs[] = $anchor->href;
        }
    }

    // simple_html_dom holds circular references; clear it explicitly or memory
    // grows with every page of a long crawl.
    $html->clear();
    unset($html);

    // Cheap emptiness test first: is_similar() scans the whole Documents table.
    if ((string) $content === "") {
        continue;
    }
    $content_stem = get_stem_words($content);
    if (is_similar($content_stem)) {
        continue;
    }
    $title_stem = get_stem_words($title);

    $count++;
    $stem_content_json = json_encode($content_stem);
    $stem_title_json = json_encode($title_stem);
    $insert->bind_param("sssss", $URL, $title, $content, $stem_title_json, $stem_content_json);
    if (!$insert->execute()) {
        error_log("spider: failed to store " . $URL . ": " . $insert->error);
    }

    // Scheme + host of the current page, used to absolutise root-relative links.
    $pathStart = strpos($URL, "/", strpos($URL, ":") + 3);
    $origin = ($pathStart === false) ? $URL : substr($URL, 0, $pathStart);

    foreach ($hrefs as $new) {
        // A missing href comes back as a non-string; one-character links are ignored too.
        if (!is_string($new) || strlen($new) <= 1) {
            continue;
        }
        if ($new[0] == '#') {
            continue; // same-page anchor
        }

        if (strpos($new, "//") === 0) {
            $new = "http:" . $new;      // protocol-relative link
        } else if ($new[0] == '/') {
            $new = $origin . $new;      // root-relative link
        }

        // Unclean URL elimination: drop the query string.
        $queryStart = strpos($new, "?");
        if ($queryStart !== false) {
            $new = substr($new, 0, $queryStart);
        }

        if (strpos($new, "iiit") === false) {
            continue;
        }
        if (!isset($visited[$new])) {
            $visited[$new] = $visited[$URL] + 1;
            array_unshift($q, $new);
        }
    }
}

$insert->close();
$conn->close();
?>