Article search - first searcher #3

Open · wants to merge 23 commits into master
13 changes: 13 additions & 0 deletions README.md
@@ -5,6 +5,19 @@ Papeeria is an IDE for your computer science papers

Article search system for Papeeria

Crawling documents:
-------------------

The Crawler class builds an index database from the abstracts of all articles in the ACM Computing Surveys (CSUR) journal.
To create the search index database, run:
./crawl_acm

Searching:
----------

The Searcher class finds the top n documents matching a query text. To start searching, run:
./start_searching

This script calls the method cos_search(...), which takes two arguments: the query text and the number of top words from that text to use in the search. You can modify both of them.

The cos_search() method implements the searcher: it uses tf-idf weights to select the top words of the query text and cosine similarity to rank matching documents.
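For reference, here is a minimal stand-alone sketch of that ranking scheme. It is not the Searcher code from this pull request (the project's cos_search reads its vectors from the db_acm database); the function names and toy data below are purely illustrative.

```python
import math
import re
from collections import Counter


def top_words(text, n, idf):
    """Pick the n words of text with the highest tf * idf weight."""
    words = [w.lower() for w in re.split(r'\W+', text) if w]
    tf = Counter(words)
    weights = dict((w, tf[w] * idf.get(w, 0.0)) for w in tf)
    return sorted(weights, key=weights.get, reverse=True)[:n]


def cos_search(text, n, idf, doc_vectors):
    """Rank documents by cosine similarity to the query's top-n words."""
    query = dict((w, idf.get(w, 0.0)) for w in top_words(text, n, idf))
    q_len = math.sqrt(sum(v * v for v in query.values())) or 1.0
    ranked = []
    for doc_id, vec in doc_vectors.items():
        dot = sum(weight * vec.get(w, 0.0) for w, weight in query.items())
        d_len = math.sqrt(sum(v * v for v in vec.values())) or 1.0
        ranked.append((dot / (q_len * d_len), doc_id))
    ranked.sort(reverse=True)
    return ranked


# Toy example: two "documents" with precomputed tf * idf weights.
idf = {'search': 1.2, 'index': 0.9, 'survey': 0.4}
docs = {'paper-1': {'search': 2.4, 'index': 0.9},
        'paper-2': {'survey': 0.8}}
print(cos_search("search index for a survey", 2, idf, docs))
```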
4 changes: 3 additions & 1 deletion crawl_acm
@@ -3,6 +3,8 @@
import search
c = search.Crawler('db_acm')
c.create_index_tables()
c.crawl("http://dl.acm.org/pub.cfm?id=J204&CFID=261281623&CFTOKEN=70296603")
c.crawl("http://dl.acm.org/pub.cfm?id=J204", "ACM Computing Surveys (CSUR)")
c.count_idf()
c.count_vectors_length()


200 changes: 178 additions & 22 deletions search.py
@@ -13,11 +13,18 @@
import urllib2
from BeautifulSoup import *
from pysqlite2 import dbapi2 as sqlite
import math

ignore_words = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])


class Crawler:
ABSTRACT_TAB_NAME = "tab_abstract"
TABLE_OF_CONTENTS_TAB_NAME = "tab_about"
ARCHIVE_TAB_NAME = "pub_series"
BASE = "http://dl.acm.org/"
IS_PAPER_LINK = "citation"
ABS_NOT_AVAILABLE = "An abstract is not available."

def __init__(self, db_name):
self.con = sqlite.connect(db_name)

@@ -53,6 +60,33 @@ def get_entry_id(self, table, field, value, create_new=True):
else:
return res[0]

def get_entry_id_url_list(self, url, title, authors, issue_id, create_new=True):
""" Return id of row in table if this row exists
Else create this row and return id for url"""
cur = self.con.execute(
"select rowid from url_list where url = '%s'" % url)
res = cur.fetchone()
if res is None:
cur = self.con.execute(
"insert into url_list (url, title, authors, issue_id) values ('%s', '%s', '%s', '%s')"
% (url, title, authors, issue_id))
return cur.lastrowid
else:
return res[0]

def get_entry_id_issue(self, url, name, jour_id, create_new=True):
""" Return id of row in table if this row exists
Else create this row and return id for issue"""
cur = self.con.execute(
"select rowid from issue where url = '%s'" % url)
res = cur.fetchone()
if res is None:
cur = self.con.execute(
"insert into issue (jour_id, name, url) values ('%s', '%s', '%s')" % (jour_id, name, url))
return cur.lastrowid
else:
return res[0]

def get_text_only(self, soup):
""" Return text from Soup of page"""
v = soup.string
@@ -71,20 +105,32 @@ def separate_words(self, text):
splitter = re.compile('\\W*')
return [s.lower() for s in splitter.split(text) if s != '']

def add_to_index(self, url, text):
def add_to_index(self, url, text, title, authors, count, issue_id):
""" Add all words from text (from url) to database.
This url becomes indexed """
if self.is_indexed(url):
return
print 'Indexing %s' % url
print '%4d Indexing %s' % (count, url)

if (title is None) and (text is None):
print "Neither text nor title are available"
return

words = []
if title is not None:
words = self.separate_words(title)

# guard against a missing abstract (get_abstract_text may return None)
if (text is None) or ((len(text) < 50) and (self.ABS_NOT_AVAILABLE in text)):
print self.ABS_NOT_AVAILABLE
else:
words_from_abstract = self.separate_words(text)
for word in words_from_abstract:
words.append(word)

words = self.separate_words(text)
url_id = self.get_entry_id('url_list', 'url', url)
url_id = self.get_entry_id_url_list(url, title, authors, issue_id)

for i in range(len(words)):
word = words[i]
if word in ignore_words:
continue
word_id = self.get_entry_id('word_list', 'word', word)
#print word_id
self.con.execute(
@@ -131,45 +177,155 @@ def get_list_of_links(self, url):
links = soup('a')
return links

def get_title(self, url):
req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
try:
con = urllib2.urlopen(req)
except:
print "I can't get title of: %s" % url
return
soup = BeautifulSoup(con.read())

authors = soup.findAll(attrs={"name":"citation_authors"})
if len(authors) > 0:
authors = authors[0]['content']
else:
authors = "Authors are unknown"
#print "Authors: %s" % authors[0]['content']
title = soup.findAll(attrs={"name":"citation_title"})
if len(title) > 0:
title = title[0]['content']
else:
title = "Title is unknown"
#print "Title: %s" % title[0]['content']
return title, authors


def get_abstract_text(self, url):
""" Return text of article's abstract"""
req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
try:
con = urllib2.urlopen(req)
except:
print "I can't open %s" % url
print "I can't open abstract: %s" % url
return
soup = BeautifulSoup(con.read())
text = self.get_text_only(soup)
#print text
return text

def crawl(self, journal_url, depth=2):
def delete_user_info(self, url):
"""Delete user info from url"""
ind = url.find('&')
new_url = url[0: ind]
return new_url

def crawl(self, journal_url, name, depth=2):
""" Begin crawling journal in ACM Library """
base = "http://dl.acm.org/"
link = self.open_tab(journal_url, "pub_series")

print " Journal link: " + journal_url
journal_id = self.get_entry_id('journal', 'name', name)

link = self.open_tab(journal_url, self.ARCHIVE_TAB_NAME)
if link is None:
return
archive_url = base + link
archive_url = self.BASE + link
links = self.get_list_of_links(archive_url)
if links is None:
return

count = 1
for link in links:
print "Journal link: " + base + link['href']
list_vol = self.open_tab(base + link['href'], "tab_about")
list_of_papers = self.get_list_of_links(base + list_vol)
info = link.string

#DEBUG
#if count > 20:
# break
if not (link['href'].startswith("citation")):
continue

ref = self.delete_user_info(link['href'])
issue_id = self.get_entry_id_issue(self.BASE + ref, info, journal_id)

print "=============="
print " Issue link: " + self.BASE + ref
print "=============="
list_vol = self.open_tab(self.BASE + ref, self.TABLE_OF_CONTENTS_TAB_NAME)
list_of_papers = self.get_list_of_links(self.BASE + list_vol)

for paper in list_of_papers:
if len(dict(paper.attrs)) == 1:
paper_abstract = self.open_tab(base + paper['href'], "tab_abstract")
text = self.get_abstract_text(base + paper_abstract)
self.add_to_index(base + paper['href'], text)
#DEBUG
#if count > 20:
# break
paper_ref = self.delete_user_info(paper['href'])

if (len(dict(paper.attrs)) == 1) and (paper_ref.startswith(self.IS_PAPER_LINK)):
ref = self.BASE + paper_ref
is_already_indexed = self.con.execute("select rowid from url_list where url = '%s'" %
ref).fetchone()
if is_already_indexed is not None:
print "%4d %s is already indexed" % (count, ref)
count += 1
continue

paper_abstract = self.open_tab(self.BASE + paper_ref, self.ABSTRACT_TAB_NAME)
if paper_abstract is None:
continue
text = self.get_abstract_text(self.BASE + paper_abstract)
meta = self.get_title(self.BASE + paper_ref)

self.add_to_index(self.BASE + paper_ref, text, meta[0], meta[1], count, issue_id)
count += 1
self.db_commit()

print "%4d papers were indexed" % (count - 1)


def count_idf(self):
'''Compute the idf of each word
and store it in the word_list table'''
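# As implemented below: idf(word) = log10(total number of indexed urls / number of urls containing the word)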
print "Counting idf..."
url_count = self.con.execute("select count(rowid) from url_list").fetchone()[0]
words_urls = self.con.execute("select word_id, count(distinct url_id) from word_location "
"group by word_id").fetchall()

for pair in words_urls:
word_id = pair[0]
num = pair[1]
idf = math.log10(float(url_count) / num)  # float division: both counts are integers
self.con.execute("update word_list set idf = %f where rowid = %d" % (idf, word_id))
self.db_commit()


def count_vectors_length(self):
'''Compute the vector length for each url (Euclidean norm of tf * idf over the words of the url)
and store it in the url_list table'''
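# As implemented below: length(url) = sqrt(sum over words of (tf * idf)^2) / total word count of the url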
print "Counting lengths..."

url_count = self.con.execute("select url_id, sum(wcount), sum(count_idf) from "
"(select url_id, count(location) as wcount, "
"count(location) * count(location) * word_list.idf * word_list.idf as count_idf "
"from word_location join word_list on word_location.word_id=word_list.rowid "
"group by url_id, word_id) T1 "
"group by T1.url_id").fetchall()

for url_record in url_count:
length = math.sqrt(url_record[2])
length = length / url_record[1]
self.con.execute("update url_list set length = %f where rowid = %d" % (length, url_record[0]))

self.db_commit()


def create_index_tables(self):
""" Create database tables """
self.con.execute('create table url_list(url)')
self.con.execute('create table word_list(word)')
res = self.con.execute('select name from sqlite_master where type="table" and name="url_list"').fetchone()
if res is not None:
return

self.con.execute('create table url_list(url, length, title, authors, issue_id)')
self.con.execute('create table issue(jour_id, name, url)')
self.con.execute('create table journal(name)')
self.con.execute('create table word_list(word, idf)')
self.con.execute('create table word_location(url_id, word_id, location)')
self.con.execute('create table link(from_id integer, to_id integer)')
self.con.execute('create table link_words(word_id, link_id)')