Article search - first searcher #3

Open · wants to merge 23 commits into master
13 changes: 13 additions & 0 deletions README.md
@@ -5,6 +5,19 @@ Papeeria is an IDE for your computer science papers

Article search system for Papeeria

Crawling documents:
-------------------

The Crawler class builds an index database from the abstracts of all articles in the ACM Computing Surveys (CSUR) journal.
To create the search index database, run:
./crawl_acm

Searching:
----------

The Searcher class finds the top n documents matching a query text. To start searching, run:
./start_searching

This script calls the method cos_search(...), which takes two arguments: the query text and the number of top words from that text to use in the search. You can modify both of them.

The cos_search() method implements the searcher: it uses tf-idf weights to select the top words of the query text and cosine similarity to rank matching documents.
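For reference, here is a minimal stand-alone sketch of that ranking scheme. It is not the Searcher code from this pull request (the project's cos_search reads its vectors from the db_acm database); the function names and toy data below are purely illustrative.

```python
import math
import re
from collections import Counter


def top_words(text, n, idf):
    """Pick the n words of text with the highest tf * idf weight."""
    words = [w.lower() for w in re.split(r'\W+', text) if w]
    tf = Counter(words)
    weights = dict((w, tf[w] * idf.get(w, 0.0)) for w in tf)
    return sorted(weights, key=weights.get, reverse=True)[:n]


def cos_search(text, n, idf, doc_vectors):
    """Rank documents by cosine similarity to the query's top-n words."""
    query = dict((w, idf.get(w, 0.0)) for w in top_words(text, n, idf))
    q_len = math.sqrt(sum(v * v for v in query.values())) or 1.0
    ranked = []
    for doc_id, vec in doc_vectors.items():
        dot = sum(weight * vec.get(w, 0.0) for w, weight in query.items())
        d_len = math.sqrt(sum(v * v for v in vec.values())) or 1.0
        ranked.append((dot / (q_len * d_len), doc_id))
    ranked.sort(reverse=True)
    return ranked


# Toy example: two "documents" with precomputed tf * idf weights.
idf = {'search': 1.2, 'index': 0.9, 'survey': 0.4}
docs = {'paper-1': {'search': 2.4, 'index': 0.9},
        'paper-2': {'survey': 0.8}}
print(cos_search("search index for a survey", 2, idf, docs))
```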
4 changes: 3 additions & 1 deletion crawl_acm
@@ -3,6 +3,8 @@
import search
c = search.Crawler('db_acm')
c.create_index_tables()
c.crawl("http://dl.acm.org/pub.cfm?id=J204&CFID=261281623&CFTOKEN=70296603")
c.crawl("http://dl.acm.org/pub.cfm?id=J204", "ACM Computing Surveys (CSUR)")
c.count_idf()
c.count_vectors_length()


200 changes: 178 additions & 22 deletions search.py
@@ -13,11 +13,18 @@
import urllib2
from BeautifulSoup import *
from pysqlite2 import dbapi2 as sqlite
import math

ignore_words = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])


class Crawler:
ABSTRACT_TAB_NAME = "tab_abstract"
TABLE_OF_CONTENTS_TAB_NAME = "tab_about"
ARCHIVE_TAB_NAME = "pub_series"
BASE = "http://dl.acm.org/"
IS_PAPER_LINK = "citation"
ABS_NOT_AVAILABLE = "An abstract is not available."

def __init__(self, db_name):
self.con = sqlite.connect(db_name)

@@ -53,6 +60,33 @@ def get_entry_id(self, table, field, value, create_new=True):
else:
return res[0]

def get_entry_id_url_list(self, url, title, authors, issue_id, create_new=True):
""" Return id of row in table if this row exists
Else create this row and return id for url"""
cur = self.con.execute(
"select rowid from url_list where url = '%s'" % url)
res = cur.fetchone()
if res is None:
cur = self.con.execute(
"insert into url_list (url, title, authors, issue_id) values ('%s', '%s', '%s', '%s')"
% (url, title, authors, issue_id))
return cur.lastrowid
else:
return res[0]

def get_entry_id_issue(self, url, name, jour_id, create_new=True):
""" Return id of row in table if this row exists
Else create this row and return id for issue"""
cur = self.con.execute(
"select rowid from issue where url = '%s'" % url)
res = cur.fetchone()
if res is None:
cur = self.con.execute(
"insert into issue (jour_id, name, url) values ('%s', '%s', '%s')" % (jour_id, name, url))
return cur.lastrowid
else:
return res[0]

def get_text_only(self, soup):
""" Return text from Soup of page"""
v = soup.string
@@ -71,20 +105,32 @@ def separate_words(self, text):
splitter = re.compile('\\W*')
return [s.lower() for s in splitter.split(text) if s != '']

def add_to_index(self, url, text):
def add_to_index(self, url, text, title, authors, count, issue_id):
""" Add all words from text (from url) to database.
This url becomes indexed """
if self.is_indexed(url):
return
print 'Indexing %s' % url
print '%4d Indexing %s' % (count, url)

if (title is None) and (text is None):
print "Neither text nor title are available"
return

words = []
if title is not None:
words = self.separate_words(title)

# guard against a missing abstract (get_abstract_text may return None)
if (text is None) or ((len(text) < 50) and (self.ABS_NOT_AVAILABLE in text)):
print self.ABS_NOT_AVAILABLE
else:
words_from_abstract = self.separate_words(text)
for word in words_from_abstract:
words.append(word)

words = self.separate_words(text)
url_id = self.get_entry_id('url_list', 'url', url)
url_id = self.get_entry_id_url_list(url, title, authors, issue_id)

for i in range(len(words)):
word = words[i]
if word in ignore_words:
continue
word_id = self.get_entry_id('word_list', 'word', word)
#print word_id
self.con.execute(
@@ -131,45 +177,155 @@ def get_list_of_links(self, url):
links = soup('a')
return links

def get_title(self, url):
req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
try:
con = urllib2.urlopen(req)
except:
print "I can't get title of: %s" % url
return
soup = BeautifulSoup(con.read())

authors = soup.findAll(attrs={"name":"citation_authors"})
if len(authors) > 0:
authors = authors[0]['content']
else:
authors = "Authors are unknown"
#print "Authors: %s" % authors[0]['content']
title = soup.findAll(attrs={"name":"citation_title"})
if len(title) > 0:
title = title[0]['content']
else:
title = "Title is unknown"
#print "Title: %s" % title[0]['content']
return title, authors


def get_abstract_text(self, url):
""" Return text of article's abstract"""
req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
try:
con = urllib2.urlopen(req)
except:
print "I can't open %s" % url
print "I can't open abstract: %s" % url
return
soup = BeautifulSoup(con.read())
text = self.get_text_only(soup)
#print text
return text

def crawl(self, journal_url, depth=2):
def delete_user_info(self, url):
"""Delete user info from url"""
ind = url.find('&')
new_url = url[0: ind]
return new_url

def crawl(self, journal_url, name, depth=2):
""" Begin crawling journal in ACM Library """
base = "http://dl.acm.org/"
link = self.open_tab(journal_url, "pub_series")

print " Journal link: " + journal_url
journal_id = self.get_entry_id('journal', 'name', name)

link = self.open_tab(journal_url, self.ARCHIVE_TAB_NAME)
if link is None:
return
archive_url = base + link
archive_url = self.BASE + link
links = self.get_list_of_links(archive_url)
if links is None:
return

count = 1
for link in links:
print "Journal link: " + base + link['href']
list_vol = self.open_tab(base + link['href'], "tab_about")
list_of_papers = self.get_list_of_links(base + list_vol)
info = link.string

#DEBUG
#if count > 20:
# break
if not (link['href'].startswith("citation")):
continue

ref = self.delete_user_info(link['href'])
issue_id = self.get_entry_id_issue(self.BASE + ref, info, journal_id)

print "=============="
print " Issue link: " + self.BASE + ref
print "=============="
list_vol = self.open_tab(self.BASE + ref, self.TABLE_OF_CONTENTS_TAB_NAME)
list_of_papers = self.get_list_of_links(self.BASE + list_vol)

for paper in list_of_papers:
if len(dict(paper.attrs)) == 1:
paper_abstract = self.open_tab(base + paper['href'], "tab_abstract")
text = self.get_abstract_text(base + paper_abstract)
self.add_to_index(base + paper['href'], text)
#DEBUG
#if count > 20:
# break
paper_ref = self.delete_user_info(paper['href'])

if (len(dict(paper.attrs)) == 1) and (paper_ref.startswith(self.IS_PAPER_LINK)):
ref = self.BASE + paper_ref
is_already_indexed = self.con.execute("select rowid from url_list where url = '%s'" %
ref).fetchone()
if is_already_indexed is not None:
print "%4d %s is already indexed" % (count, ref)
count += 1
continue

paper_abstract = self.open_tab(self.BASE + paper_ref, self.ABSTRACT_TAB_NAME)
if paper_abstract is None:
continue
text = self.get_abstract_text(self.BASE + paper_abstract)
meta = self.get_title(self.BASE + paper_ref)

self.add_to_index(self.BASE + paper_ref, text, meta[0], meta[1], count, issue_id)
count += 1
self.db_commit()

print "%4d papers were indexed" % (count - 1)


def count_idf(self):
'''Compute the idf of each word
and store it in the word_list table'''
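# As implemented below: idf(word) = log10(total number of indexed urls / number of urls containing the word)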
print "Counting idf..."
url_count = self.con.execute("select count(rowid) from url_list").fetchone()[0]
words_urls = self.con.execute("select word_id, count(distinct url_id) from word_location "
"group by word_id").fetchall()

for pair in words_urls:
word_id = pair[0]
num = pair[1]
idf = math.log10(float(url_count) / num)  # float division: both counts are integers
self.con.execute("update word_list set idf = %f where rowid = %d" % (idf, word_id))
self.db_commit()


def count_vectors_length(self):
'''Compute the vector length for each url (Euclidean norm of tf * idf over the words of the url)
and store it in the url_list table'''
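# As implemented below: length(url) = sqrt(sum over words of (tf * idf)^2) / total word count of the url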
print "Counting lengths..."

url_count = self.con.execute("select url_id, sum(wcount), sum(count_idf) from "
"(select url_id, count(location) as wcount, "
"count(location) * count(location) * word_list.idf * word_list.idf as count_idf "
"from word_location join word_list on word_location.word_id=word_list.rowid "
"group by url_id, word_id) T1 "
"group by T1.url_id").fetchall()

for url_record in url_count:
length = math.sqrt(url_record[2])
length = length / url_record[1]
self.con.execute("update url_list set length = %f where rowid = %d" % (length, url_record[0]))

self.db_commit()


def create_index_tables(self):
""" Create database tables """
self.con.execute('create table url_list(url)')
self.con.execute('create table word_list(word)')
res = self.con.execute('select name from sqlite_master where type="table" and name="url_list"').fetchone()
if res is not None:
return

self.con.execute('create table url_list(url, length, title, authors, issue_id)')
self.con.execute('create table issue(jour_id, name, url)')
self.con.execute('create table journal(name)')
self.con.execute('create table word_list(word, idf)')
self.con.execute('create table word_location(url_id, word_id, location)')
self.con.execute('create table link(from_id integer, to_id integer)')
self.con.execute('create table link_words(word_id, link_id)')