# Copyright 2013 Elizabeth Shashkova
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
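""" Crawler for journals in the ACM Digital Library: it follows journal,
volume and abstract links and stores a word-location index in SQLite. """
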
import re
import urllib2
from BeautifulSoup import BeautifulSoup
from pysqlite2 import dbapi2 as sqlite

ignore_words = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])


class Crawler:
    def __init__(self, db_name):
        self.con = sqlite.connect(db_name)

    def __del__(self):
        self.con.close()

    def db_commit(self):
        self.con.commit()

    def is_indexed(self, url):
        """ Check if url already exists in the database.
        If it does, this page is already indexed """
        u = self.con.execute(
            "select * from word_location join url_list on "
            "url_list.rowid = word_location.url_id where url = '%s'" % url).fetchone()
        if u is not None:
            return True
        return False

    def add_link_ref(self, url_from, url_to, link_text):
        pass

    def get_entry_id(self, table, field, value, create_new=True):
        """ Return the rowid of the row in table if it exists,
        else create the row and return its rowid """
        cur = self.con.execute(
            "select rowid from %s where %s = '%s'" % (table, field, value))
        res = cur.fetchone()
        if res is None:
            cur = self.con.execute(
                "insert into %s (%s) values ('%s')" % (table, field, value))
            return cur.lastrowid
        else:
            return res[0]

    def get_text_only(self, soup):
        """ Return the text from the soup of a page """
        v = soup.string
        if v is None:
            c = soup.contents
            result_text = ''
            for t in c:
                sub_text = self.get_text_only(t)
                result_text += sub_text + '\n'
            return result_text
        else:
            return v.strip()

    def separate_words(self, text):
        """ Separate words in text """
        splitter = re.compile(r'\W*')
        return [s.lower() for s in splitter.split(text) if s != '']

    def add_to_index(self, url, text):
        """ Add all words from text (from url) to database.
        This url becomes indexed """
        if self.is_indexed(url):
            return
        print 'Indexing %s' % url
        words = self.separate_words(text)
        url_id = self.get_entry_id('url_list', 'url', url)
        for i in range(len(words)):
            word = words[i]
            if word in ignore_words:
                continue
            word_id = self.get_entry_id('word_list', 'word', word)
            #print word_id
            self.con.execute(
                "insert into word_location(url_id, word_id, location) "
                "values (%d, %d, %d)" % (url_id, word_id, i))

    def open_tab(self, url, tab_name):
        """ Return a link to get content from a tab in the ACM Library menu.
        tab_name is the name of the tab """
        req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
        try:
            con = urllib2.urlopen(req)
        except:
            print "I can't open %s" % url
            return
        soup = BeautifulSoup(con.read())
        scripts = soup.findAll("script")
        # tab links are hidden in escaped 'bindExpr':['...'] fragments
        # inside the page's inline scripts
        m = re.compile("'bindExpr\\\\':\[\\\\'[^']*\\\\'\]")
        for tag in scripts:
            if len(tag.contents) > 0:
                script_content = str(tag.contents)
                #print script_content
                matches = m.finditer(script_content)
                for expr in matches:
                    expr_str = expr.group(0)
                    #print expr_str
                    if tab_name in expr_str:
                        begin_ind = expr_str.find(tab_name)
                        link = expr_str[begin_ind:(len(expr_str) - 3)]
                        link = link.replace(";", "")
                        #print link
                        return link

    def get_list_of_links(self, url):
        """ Return all links from url """
        req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
        try:
            con = urllib2.urlopen(req)
        except:
            print "I can't open %s" % url
            return
        soup = BeautifulSoup(con.read())
        links = soup('a')
        return links

    def get_abstract_text(self, url):
        """ Return text of article's abstract """
        req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
        try:
            con = urllib2.urlopen(req)
        except:
            print "I can't open %s" % url
            return
        soup = BeautifulSoup(con.read())
        text = self.get_text_only(soup)
        #print text
        return text

    def crawl(self, journal_url, depth=2):
        """ Begin crawling journal in ACM Library """
        base = "http://dl.acm.org/"
        link = self.open_tab(journal_url, "pub_series")
        if link is None:
            return
        archive_url = base + link
        links = self.get_list_of_links(archive_url)
        if links is None:
            return
        for link in links:
            print "Journal link: " + base + link['href']
            list_vol = self.open_tab(base + link['href'], "tab_about")
            if list_vol is None:
                continue
            list_of_papers = self.get_list_of_links(base + list_vol)
            if list_of_papers is None:
                continue
            for paper in list_of_papers:
                if len(dict(paper.attrs)) == 1:
                    paper_abstract = self.open_tab(base + paper['href'], "tab_abstract")
                    if paper_abstract is None:
                        continue
                    text = self.get_abstract_text(base + paper_abstract)
                    if text is not None:
                        self.add_to_index(base + paper['href'], text)
            self.db_commit()

    def create_index_tables(self):
        """ Create database tables """
        self.con.execute('create table url_list(url)')
        self.con.execute('create table word_list(word)')
        self.con.execute('create table word_location(url_id, word_id, location)')
        self.con.execute('create table link(from_id integer, to_id integer)')
        self.con.execute('create table link_words(word_id, link_id)')
        self.con.execute('create index word_idx on word_list(word)')
        self.con.execute('create index url_idx on url_list(url)')
        self.con.execute('create index word_url_idx on word_location(word_id)')
        self.con.execute('create index url_to_idx on link(to_id)')
        self.con.execute('create index url_from_idx on link(from_id)')
        self.db_commit()
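

# Minimal usage sketch: the database file name and the journal URL below are
# hypothetical placeholders, assuming the URL points at a journal front page
# in the ACM Digital Library.
if __name__ == '__main__':
    crawler = Crawler('searchindex.db')
    crawler.create_index_tables()  # run once; fails if the tables already exist
    journal_url = 'http://dl.acm.org/pub.cfm?id=XXXX'  # placeholder journal URL
    crawler.crawl(journal_url)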