diff --git a/README.md b/README.md index c964e68..d119c72 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,19 @@ Papeeria is an IDE for your computer science papers Article search system for Papeeria +Crawling documents: +------------------- +Class Crawler creates an index database of the abstracts of all articles from the ACM Computing Surveys (CSUR) journal. Create the search index database with: ./crawl_acm + +Searching: +---------- + +Class Searcher searches for the top n documents matching a given text. Start searching: +./start_search + +This script calls the method cos_search(...), which takes 2 arguments: the text to search for and the number of top words from that text to use for searching. You can modify both of them. + +Method cos_search() implements the search using tf-idf to pick the top words in the text and cosine similarity to find the most relevant documents. diff --git a/crawl_acm b/crawl_acm index 8ca0a7e..3e1fb71 100755 --- a/crawl_acm +++ b/crawl_acm @@ -3,6 +3,8 @@ import search c = search.Crawler('db_acm') c.create_index_tables() -c.crawl("http://dl.acm.org/pub.cfm?id=J204&CFID=261281623&CFTOKEN=70296603") +c.crawl("http://dl.acm.org/pub.cfm?id=J204", "ACM Computing Surveys (CSUR)") +c.count_idf() +c.count_vectors_length() diff --git a/search.py b/search.py index 506dc75..650347c 100755 --- a/search.py +++ b/search.py @@ -13,11 +13,18 @@ import urllib2 from BeautifulSoup import * from pysqlite2 import dbapi2 as sqlite +import math -ignore_words = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it']) class Crawler: + ABSTRACT_TAB_NAME = "tab_abstract" + TABLE_OF_CONTENTS_TAB_NAME = "tab_about" + ARCHIVE_TAB_NAME = "pub_series" + BASE = "http://dl.acm.org/" + IS_PAPER_LINK = "citation" + ABS_NOT_AVAILABLE = "An abstract is not available." + def __init__(self, db_name): self.con = sqlite.connect(db_name) @@ -53,6 +60,33 @@ def get_entry_id(self, table, field, value, create_new=True): else: return res[0] + def get_entry_id_url_list(self, url, title, authors, issue_id, create_new=True): + """ Return id of row in table if this row exists + Else create this row and return id for url""" + cur = self.con.execute( + "select rowid from url_list where url = '%s'" % url) + res = cur.fetchone() + if res is None: + cur = self.con.execute( + "insert into url_list (url, title, authors, issue_id) values ('%s', '%s', '%s', '%s')" + % (url, title, authors, issue_id)) + return cur.lastrowid + else: + return res[0] + + def get_entry_id_issue(self, url, name, jour_id, create_new=True): + """ Return id of row in table if this row exists + Else create this row and return id for issue""" + cur = self.con.execute( + "select rowid from issue where url = '%s'" % url) + res = cur.fetchone() + if res is None: + cur = self.con.execute( + "insert into issue (jour_id, name, url) values ('%s', '%s', '%s')" % (jour_id, name, url)) + return cur.lastrowid + else: + return res[0] + def get_text_only(self, soup): """ Return text from Soup of page""" v = soup.string @@ -71,20 +105,32 @@ def separate_words(self, text): splitter = re.compile('\\W*') return [s.lower() for s in splitter.split(text) if s != ''] - def add_to_index(self, url, text): + def add_to_index(self, url, text, title, authors, count, issue_id): """ Add all words from text (from url) to database.
This url becomes indexed """ if self.is_indexed(url): return - print 'Indexing %s' % url + print '%4d Indexing %s' % (count, url) + + if (title is None) and (text is None): + print "Neither text nor title are available" + return + + words = [] + if title is not None: + words = self.separate_words(title) + + if (len(text) < 50) and (self.ABS_NOT_AVAILABLE in text): + print self.ABS_NOT_AVAILABLE + else: + words_from_abstract = self.separate_words(text) + for word in words_from_abstract: + words.append(word) - words = self.separate_words(text) - url_id = self.get_entry_id('url_list', 'url', url) + url_id = self.get_entry_id_url_list(url, title, authors, issue_id) for i in range(len(words)): word = words[i] - if word in ignore_words: - continue word_id = self.get_entry_id('word_list', 'word', word) #print word_id self.con.execute( @@ -131,45 +177,155 @@ def get_list_of_links(self, url): links = soup('a') return links + def get_title(self, url): + req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"}) + try: + con = urllib2.urlopen(req) + except: + print "I can't get title of: %s" % url + return + soup = BeautifulSoup(con.read()) + + authors = soup.findAll(attrs={"name":"citation_authors"}) + if len(authors) > 0: + authors = authors[0]['content'] + else: + authors = "Authors are unknown" + #print "Authors: %s" % authors[0]['content'] + title = soup.findAll(attrs={"name":"citation_title"}) + if len(title) > 0: + title = title[0]['content'] + else: + title = "Title is unknown" + #print "Title: %s" % title[0]['content'] + return title, authors + + def get_abstract_text(self, url): """ Return text of article's abstract""" req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"}) try: con = urllib2.urlopen(req) except: - print "I can't open %s" % url + print "I can't open abstract: %s" % url return soup = BeautifulSoup(con.read()) text = self.get_text_only(soup) - #print text return text - def crawl(self, journal_url, depth=2): + def delete_user_info(self, url): + """Delete user info from url""" + ind = url.find('&') + new_url = url[0: ind] + return new_url + + def crawl(self, journal_url, name, depth=2): """ Begin crawling journal in ACM Library """ - base = "http://dl.acm.org/" - link = self.open_tab(journal_url, "pub_series") + + print " Journal link: " + journal_url + journal_id = self.get_entry_id('journal', 'name', name) + + link = self.open_tab(journal_url, self.ARCHIVE_TAB_NAME) if link is None: return - archive_url = base + link + archive_url = self.BASE + link links = self.get_list_of_links(archive_url) if links is None: return + count = 1 for link in links: - print "Journal link: " + base + link['href'] - list_vol = self.open_tab(base + link['href'], "tab_about") - list_of_papers = self.get_list_of_links(base + list_vol) + info = link.string + + #DEBUG + #if count > 20: + # break + if not (link['href'].startswith("citation")): + continue + + ref = self.delete_user_info(link['href']) + issue_id = self.get_entry_id_issue(self.BASE + ref, info, journal_id) + + print "==============" + print " Issue link: " + self.BASE + ref + print "==============" + list_vol = self.open_tab(self.BASE + ref, self.TABLE_OF_CONTENTS_TAB_NAME) + list_of_papers = self.get_list_of_links(self.BASE + list_vol) + for paper in list_of_papers: - if len(dict(paper.attrs)) == 1: - paper_abstract = self.open_tab(base + paper['href'], "tab_abstract") - text = self.get_abstract_text(base + paper_abstract) - self.add_to_index(base + paper['href'], text) + #DEBUG + #if count > 20: + # break + 
paper_ref = self.delete_user_info(paper['href']) + + if (len(dict(paper.attrs)) == 1) and (paper_ref.startswith(self.IS_PAPER_LINK)): + ref = self.BASE + paper_ref + is_already_indexed = self.con.execute("select rowid from url_list where url = '%s'" % + ref).fetchone() + if is_already_indexed is not None: + print "%4d %s is already indexed" % (count, ref) + count += 1 + continue + + paper_abstract = self.open_tab(self.BASE + paper_ref, self.ABSTRACT_TAB_NAME) + if paper_abstract is None: + continue + text = self.get_abstract_text(self.BASE + paper_abstract) + meta = self.get_title(self.BASE + paper_ref) + + self.add_to_index(self.BASE + paper_ref, text, meta[0], meta[1], count, issue_id) + count += 1 self.db_commit() + print "%4d papers were indexed" % (count - 1) + + + def count_idf(self): + '''Count idf for each word + Set this value to the table word_list''' + print "Counting idf..." + url_count = self.con.execute("select count(rowid) from url_list").fetchone()[0] + words_urls = self.con.execute("select word_id, count(distinct url_id) from word_location " + "group by word_id").fetchall() + + for pair in words_urls: + word_id = pair[0] + num = pair[1] + idf = math.log10(url_count / num) + self.con.execute("update word_list set idf = %f where rowid = %d" % (idf, word_id)) + self.db_commit() + + + def count_vectors_length(self): + '''Count vector's length for each url (Euclidean norm of tf * idf for each word in url) + Set this value to the table url_list''' + print "Counting lengths..." + + url_count = self.con.execute("select url_id, sum(wcount), sum(count_idf) from " + "(select url_id, count(location) as wcount, " + "count(location) * count(location) * word_list.idf * word_list.idf as count_idf " + "from word_location join word_list on word_location.word_id=word_list.rowid " + "group by url_id, word_id) T1 " + "group by T1.url_id").fetchall() + + for url_record in url_count: + length = math.sqrt(url_record[2]) + length = length / url_record[1] + self.con.execute("update url_list set length = %f where rowid = %d" % (length, url_record[0])) + + self.db_commit() + + def create_index_tables(self): """ Create database tables """ - self.con.execute('create table url_list(url)') - self.con.execute('create table word_list(word)') + res = self.con.execute('select name from sqlite_master where type="table" and name="url_list"').fetchone() + if res is not None: + return + + self.con.execute('create table url_list(url, length, title, authors, issue_id)') + self.con.execute('create table issue(jour_id, name, url)') + self.con.execute('create table journal(name)') + self.con.execute('create table word_list(word, idf)') self.con.execute('create table word_location(url_id, word_id, location)') self.con.execute('create table link(from_id integer, to_id integer)') self.con.execute('create table link_words(word_id, link_id)') diff --git a/searcher.py b/searcher.py new file mode 100644 index 0000000..ddf25c6 --- /dev/null +++ b/searcher.py @@ -0,0 +1,263 @@ +# Copyright 2013 Elizabeth Shashkova +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from BeautifulSoup import * +from pysqlite2 import dbapi2 as sqlite +import operator +import math +import heapq +import sys + +class Searcher: + SHOW_ANSWER = 10 + MAGIC_NUMBER = 20 + IDF_BOUND = 1 + COS_BOUND = 0.1 + + def __init__(self, db_name): + self.con = sqlite.connect(db_name) + + def __del__(self): + self.con.close() + + def db_commit(self): + self.con.commit() + + @staticmethod + def separate_words(text): + splitter = re.compile('\\W*') + return [s.lower() for s in splitter.split(text) if s != ''] + + + def get_top_words(self, words, n): + '''Return top n tf * idf words in text + Return list of words ''' + words_top = {word: 0 for word in words} + for word in words: + words_top[word] += 1 + + for word in words_top: + word_idf = self.con.execute( + "select idf from word_list where word = '%s'" % word).fetchone() + if word_idf is None: + words_top[word] = 0 + else: + word_idf = word_idf[0] + if word_idf > self.IDF_BOUND: + words_top[word] = words_top[word] * word_idf + else: + words_top[word] = 0 + + words_top = {word: words_top[word] for word in words_top if words_top[word] > 0} + + sorted_top = sorted(words_top.iteritems(), key=operator.itemgetter(1), reverse=True) + clear_list = [pair[0] for pair in sorted_top] + if len(clear_list) <= n: + return clear_list + else: + return clear_list[0: n] + + + def find_rows(self, words, words_tf_idf_idf): + '''Find documents which contain one of words and count cos distance''' + if len(words) == 0: + return [] + + word_id_list = [] + table_num = 0 + #clause_list = '' + query = 'SELECT url_id, denorm_rank / url_list.length AS rank FROM (' \ + ' SELECT url_id, sum/count(*) AS denorm_rank FROM (' + fat_query = 'SELECT url_id as urlid, sum(weight) AS sum, count(*) AS match_words_num FROM (' + + for word in words: + word_row = self.con.execute( + "SELECT rowid FROM word_list WHERE word = '%s'" % word).fetchone() + if word_row is not None: + word_id = word_row[0] + word_id_list.append(word_id) + if table_num > 0: + fat_query += ' UNION ' + fat_query += 'SELECT url_id, count(*) * %f AS weight ' \ + 'FROM word_location WHERE word_id = %d ' \ + 'GROUP BY url_id' % (words_tf_idf_idf[word], word_id) + table_num += 1 + + fat_query += ') GROUP BY urlid ' + query += fat_query + query += ') as FatQuery JOIN word_location ON (FatQuery.urlid = word_location.url_id)' \ + 'GROUP BY urlid, sum, match_words_num)' \ + 'JOIN url_list ON url_list.rowid = url_id' +# print query +# print fat_query + result = self.con.execute(query) + rows = [row for row in result] + return rows + + + def tf(self, words): + '''Return tf of words''' + words_top = {word: 0 for word in words} + for word in words: + words_top[word] += 1 + words_freq = {word: words_top[word] / float(len(words)) for word in words_top} + + return words_freq + + def idf(self, word): + '''Return idf of word''' + idf = self.con.execute("select idf from word_list where word = '%s'" % word).fetchone() + if idf is None: + return 0 + else: + return idf[0] + + def get_top_tf_idf(self, words, top_words): + '''Return values of tf * idf * idf for words in top_words''' + tf_dict = self.tf(words) + idf_dict = {word: 0 for word in top_words} + + for word in tf_dict: + if word in top_words: + idf = self.idf(word) + idf_dict[word] = idf + tf_dict[word] = tf_dict[word] * idf * idf + else: + tf_dict[word] = 0 + + tf_dict = {word: tf_dict[word] for word in tf_dict if tf_dict[word] > 0} + inverted = [(word, tf_dict[word]) 
for word in tf_dict] + return inverted, idf_dict + + + def get_url_by_id(self, url_id): + '''Return url by its id''' + url = self.con.execute("select url_list.url, title, authors, journal.name, issue.name from url_list " + "join issue on url_list.issue_id = issue.rowid " + "join journal on issue.jour_id = journal.rowid " + "where url_list.rowid = '%s'" % url_id).fetchall()[0] + return url + + @staticmethod + def count_length(l, idfs): + '''Return Euclidean norm of the vector, saved in list of pairs''' + words_dict = {pair[0]: (pair[1] / idfs[pair[0]]) for pair in l} + length = 0 + for word in words_dict: + length = length + words_dict[word] * words_dict[word] + + length = math.sqrt(length) + return length + + @staticmethod + def cos_distance(url_words_tf_idf, url_words_length, words_tf_idf, words_length): + ''' Count cos distance between two vectors of words + url_words_tf_idf - list of pairs (word, tf * idf), where words are words from url + url_words_length - norm of vector url_words_tf_idf + words_tf_idf - list of pairs (word, tf * idf), where words are words from text + words_length - norm of vector words_tf_idf ''' + + if url_words_length == 0 or words_length == 0: + return 0 + + words_dict = {pair[0]: pair[1] for pair in words_tf_idf} + url_words_dict = {pair[0]: pair[1] for pair in url_words_tf_idf} + + sc_product = 0 + for word in words_dict: + if word in url_words_dict: + sc_product += words_dict[word] * url_words_dict[word] + + return sc_product / (url_words_length * words_length) + + + def cos_search(self, text, n): + ''' Start search by cos distance between text and documents(urls) + n means taking n top words from the text + top is counted by (tf * idf) ''' + + text_words = Searcher.separate_words(text) + top_text_words = self.get_top_words(text_words, n) + + answer = self.get_top_tf_idf(text_words, top_text_words) + text_words_tf_idf_idf = answer[0] + top_idfs = answer[1] + + text_length = Searcher.count_length(text_words_tf_idf_idf, top_idfs) + +# print top_text_words +# print text_words_tf_idf_idf + + url_ids_cos = self.find_rows(top_text_words, {word: tf_idf_idf for (word, tf_idf_idf) in text_words_tf_idf_idf}) + + #url_ids = [url_id[0] for url_id in url_ids] + url_count = len(url_ids_cos) + + url_full_count = self.con.execute("select count(rowid) from url_list").fetchone()[0] + print >> sys.stderr, "Number of documents: %d " % url_full_count + + print >> sys.stderr, "Number of documents after cutting: %d " % url_count + print >> sys.stderr, "Searching..." 
+ + ''' + heap = [] + url_ids = [] + for url_id in url_ids: + #print url_id + url_words = self.con.execute("select word from word_list join word_location on " + " word_list.rowid = word_location.word_id where " + " word_location.url_id = %s" % url_id).fetchall() + + url_words = [pair[0] for pair in url_words] + url_words_tf_idf = self.get_top_tf_idf(url_words, len(url_words)) + url_length = self.con.execute("select length from url_list where rowid = %d" % url_id).fetchone()[0] + url_cos = Searcher.cos_distance(url_words_tf_idf, url_length, text_words_tf_idf, text_length) + + if url_cos < self.COS_BOUND: + continue + + if len(heap) < self.SHOW_ANSWER: + heapq.heappush(heap, (url_cos, url_id)) + else: + heapq.heappushpop(heap, (url_cos, url_id)) + ''' + heap = [] + for url_pair in url_ids_cos: + url_cos = url_pair[1] / text_length + url_id = url_pair[0] + if len(heap) < self.SHOW_ANSWER: + heapq.heappush(heap, (url_cos, url_id)) + else: + heapq.heappushpop(heap, (url_cos, url_id)) + + + heap.sort(reverse=True) + top_n = [(pair[1], pair[0]) for pair in heap] + + print '{"articles": [' + number = 1 + for url_id in top_n: + if url_id[1] > 0: + if number > 1: + print "," + article_data = self.get_url_by_id(url_id[0]) + print "{" + print '"docid": %4d,' % url_id[0] + print '"rank": %f, ' % url_id[1] + print '"url": "%s", ' % article_data[0] + print '"title": "%s", ' % article_data[1] + print '"authors": "%s", ' % article_data[2].encode('utf-8') + print '"journal": "%s", ' % article_data[3] + print '"issue": "%s" ' % article_data[4] + print "}" + number += 1 + print "]}" \ No newline at end of file diff --git a/start_search b/start_search new file mode 100755 index 0000000..5ecee9c --- /dev/null +++ b/start_search @@ -0,0 +1,7 @@ +#!/usr/bin/env python + +import searcher +s = searcher.Searcher('db_acm') +s.cos_search("Quality of service service service service service ", 10) + +
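The README above describes cos_search() as picking the top words of a query by tf-idf and ranking documents by cosine similarity. Below is a minimal, self-contained sketch of that ranking idea on a toy in-memory corpus; the helper names (tokenize, rank) and the sample documents are hypothetical and are not part of search.py or searcher.py, which keep the index in SQLite instead.

```python
# Illustrative sketch only: tf-idf weighting and cosine-similarity ranking on a
# toy corpus. Nothing here is the repository's API.
import math
import re
from collections import Counter

def tokenize(text):
    return [w.lower() for w in re.split(r'\W+', text) if w]

def tf(tokens):
    counts = Counter(tokens)
    total = float(len(tokens))
    return dict((w, c / total) for w, c in counts.items())

def idf(all_doc_tokens):
    n_docs = len(all_doc_tokens)
    doc_freq = Counter(w for tokens in all_doc_tokens for w in set(tokens))
    return dict((w, math.log10(float(n_docs) / df)) for w, df in doc_freq.items())

def cosine(a, b):
    dot = sum(weight * b.get(word, 0.0) for word, weight in a.items())
    norm_a = math.sqrt(sum(w * w for w in a.values()))
    norm_b = math.sqrt(sum(w * w for w in b.values()))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0

def rank(query, documents):
    doc_tokens = [tokenize(d) for d in documents]
    idfs = idf(doc_tokens)
    doc_vectors = [dict((w, f * idfs[w]) for w, f in tf(t).items()) for t in doc_tokens]
    query_vector = dict((w, f * idfs.get(w, 0.0)) for w, f in tf(tokenize(query)).items())
    scores = [(cosine(query_vector, v), i) for i, v in enumerate(doc_vectors)]
    return sorted(scores, reverse=True)   # best (score, document index) first

documents = ["Quality of service in computer networks",
             "A survey of machine learning methods",
             "Measuring network service quality"]
print(rank("quality of service", documents))
```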
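crawl_acm now ends with count_idf() and count_vectors_length(), which store each word's idf in word_list and each document's normalised vector length in url_list. The sketch below reproduces those two computations on a toy copy of that schema, using the standard sqlite3 module and made-up rows (the repository itself uses pysqlite2 and real crawled data). One caveat worth checking in the patch itself: under Python 2, url_count / num in count_idf() is integer division, so the sketch divides in floating point.

```python
# Sketch only: idf and document-length precomputation on a toy copy of the
# crawler's schema. sqlite3 and the sample rows are stand-ins, not repo code.
import math
import sqlite3

con = sqlite3.connect(':memory:')
con.execute('create table url_list(url, length)')
con.execute('create table word_list(word, idf)')
con.execute('create table word_location(url_id, word_id, location)')

con.executemany('insert into url_list(url) values (?)', [('u1',), ('u2',), ('u3',)])
con.executemany('insert into word_list(word) values (?)',
                [('quality',), ('service',), ('networks',)])
# (url_id, word_id, location): three toy documents with a few word occurrences
con.executemany('insert into word_location values (?, ?, ?)',
                [(1, 1, 0), (1, 2, 1), (2, 1, 0), (2, 3, 1), (3, 2, 0), (3, 2, 1)])

# idf(word) = log10(number of documents / number of documents containing the word)
n_docs = con.execute('select count(rowid) from url_list').fetchone()[0]
doc_freqs = con.execute('select word_id, count(distinct url_id) from word_location '
                        'group by word_id').fetchall()
for word_id, doc_freq in doc_freqs:
    idf = math.log10(float(n_docs) / doc_freq)  # float() avoids Py2 integer division
    con.execute('update word_list set idf = ? where rowid = ?', (idf, word_id))

# length(url): norm of the count*idf vector divided by the document's word count,
# which equals the Euclidean norm of its tf*idf vector
for (url_id,) in con.execute('select rowid from url_list').fetchall():
    rows = con.execute('select count(location), idf from word_location '
                       'join word_list on word_list.rowid = word_id '
                       'where url_id = ? group by word_id', (url_id,)).fetchall()
    total_words = sum(count for count, _ in rows)
    norm = math.sqrt(sum((count * idf) ** 2 for count, idf in rows))
    con.execute('update url_list set length = ? where rowid = ?',
                (norm / total_words, url_id))
con.commit()

print(con.execute('select word, idf from word_list').fetchall())
print(con.execute('select url, length from url_list').fetchall())
```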
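At the end of cos_search(), the best SHOW_ANSWER documents are kept by pushing (cosine score, url_id) pairs through a fixed-size heap. The fragment below illustrates that top-k pattern with heapq on made-up scores; it is a stand-alone sketch, not the repository's code.

```python
# Stand-alone illustration of the fixed-size heap used to keep the top-k
# (score, doc_id) pairs; the scores here are made up.
import heapq

K = 3
scored_docs = [(0.12, 1), (0.80, 2), (0.05, 3), (0.44, 4), (0.61, 5)]

heap = []
for score, doc_id in scored_docs:
    if len(heap) < K:
        heapq.heappush(heap, (score, doc_id))      # heap not full yet
    else:
        heapq.heappushpop(heap, (score, doc_id))   # evict the current minimum

heap.sort(reverse=True)                            # best match first
print(heap)   # [(0.8, 2), (0.61, 5), (0.44, 4)]
```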
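cos_search() prints its result as hand-assembled JSON ({"articles": [...]}); a title or author string containing a double quote would make that output invalid JSON. If that matters to the consumer of this output, one option is to build a list of dicts and let the json module do the escaping, as in the sketch below; the field names mirror the ones cos_search() prints, and the sample values are made up.

```python
# Sketch: emitting the same fields via the json module so quotes in titles or
# author lists cannot break the output. All values below are invented examples.
import json

articles = [
    {"docid": 42, "rank": 0.731,
     "url": "http://dl.acm.org/citation.cfm?id=0000000",
     "title": 'An example title containing "quotes"',
     "authors": "A. Author; B. Writer",
     "journal": "ACM Computing Surveys (CSUR)",
     "issue": "Volume 1 Issue 1"},
]
print(json.dumps({"articles": articles}, indent=2))
```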