diff --git a/README.md b/README.md index c964e68..d119c72 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,19 @@ Papeeria is an IDE for your computer science papers Article search system for Papeeria +Crawling documents: +------------------- +Class Crawler creates an index database of the abstracts of all articles from the ACM Computing Surveys (CSUR) journal. Create the search index database with: ./crawl_acm + +Searching: +---------- + +Class Searcher searches for the top n documents matching a given text. Start searching: +./start_search + +This script calls the method cos_search(...), which takes 2 arguments: the text to search for and the number of top words from that text to use for searching. You can modify both of them. + +Method cos_search() implements the search using tf-idf to pick the top words in the text and cosine similarity to find the most relevant documents. diff --git a/crawl_acm b/crawl_acm index 8ca0a7e..3e1fb71 100755 --- a/crawl_acm +++ b/crawl_acm @@ -3,6 +3,8 @@ import search c = search.Crawler('db_acm') c.create_index_tables() -c.crawl("http://dl.acm.org/pub.cfm?id=J204&CFID=261281623&CFTOKEN=70296603") +c.crawl("http://dl.acm.org/pub.cfm?id=J204", "ACM Computing Surveys (CSUR)") +c.count_idf() +c.count_vectors_length() diff --git a/search.py b/search.py index 506dc75..650347c 100755 --- a/search.py +++ b/search.py @@ -13,11 +13,18 @@ import urllib2 from BeautifulSoup import * from pysqlite2 import dbapi2 as sqlite +import math -ignore_words = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it']) class Crawler: + ABSTRACT_TAB_NAME = "tab_abstract" + TABLE_OF_CONTENTS_TAB_NAME = "tab_about" + ARCHIVE_TAB_NAME = "pub_series" + BASE = "http://dl.acm.org/" + IS_PAPER_LINK = "citation" + ABS_NOT_AVAILABLE = "An abstract is not available." + def __init__(self, db_name): self.con = sqlite.connect(db_name) @@ -53,6 +60,33 @@ def get_entry_id(self, table, field, value, create_new=True): else: return res[0] + def get_entry_id_url_list(self, url, title, authors, issue_id, create_new=True): + """ Return id of row in table if this row exists + Else create this row and return id for url""" + cur = self.con.execute( + "select rowid from url_list where url = '%s'" % url) + res = cur.fetchone() + if res is None: + cur = self.con.execute( + "insert into url_list (url, title, authors, issue_id) values ('%s', '%s', '%s', '%s')" + % (url, title, authors, issue_id)) + return cur.lastrowid + else: + return res[0] + + def get_entry_id_issue(self, url, name, jour_id, create_new=True): + """ Return id of row in table if this row exists + Else create this row and return id for issue""" + cur = self.con.execute( + "select rowid from issue where url = '%s'" % url) + res = cur.fetchone() + if res is None: + cur = self.con.execute( + "insert into issue (jour_id, name, url) values ('%s', '%s', '%s')" % (jour_id, name, url)) + return cur.lastrowid + else: + return res[0] + def get_text_only(self, soup): """ Return text from Soup of page""" v = soup.string @@ -71,20 +105,32 @@ def separate_words(self, text): splitter = re.compile('\\W*') return [s.lower() for s in splitter.split(text) if s != ''] - def add_to_index(self, url, text): + def add_to_index(self, url, text, title, authors, count, issue_id): """ Add all words from text (from url) to database.
This url becomes indexed """ if self.is_indexed(url): return - print 'Indexing %s' % url + print '%4d Indexing %s' % (count, url) + + if (title is None) and (text is None): + print "Neither text nor title are available" + return + + words = [] + if title is not None: + words = self.separate_words(title) + + if (len(text) < 50) and (self.ABS_NOT_AVAILABLE in text): + print self.ABS_NOT_AVAILABLE + else: + words_from_abstract = self.separate_words(text) + for word in words_from_abstract: + words.append(word) - words = self.separate_words(text) - url_id = self.get_entry_id('url_list', 'url', url) + url_id = self.get_entry_id_url_list(url, title, authors, issue_id) for i in range(len(words)): word = words[i] - if word in ignore_words: - continue word_id = self.get_entry_id('word_list', 'word', word) #print word_id self.con.execute( @@ -131,45 +177,155 @@ def get_list_of_links(self, url): links = soup('a') return links + def get_title(self, url): + req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"}) + try: + con = urllib2.urlopen(req) + except: + print "I can't get title of: %s" % url + return + soup = BeautifulSoup(con.read()) + + authors = soup.findAll(attrs={"name":"citation_authors"}) + if len(authors) > 0: + authors = authors[0]['content'] + else: + authors = "Authors are unknown" + #print "Authors: %s" % authors[0]['content'] + title = soup.findAll(attrs={"name":"citation_title"}) + if len(title) > 0: + title = title[0]['content'] + else: + title = "Title is unknown" + #print "Title: %s" % title[0]['content'] + return title, authors + + def get_abstract_text(self, url): """ Return text of article's abstract""" req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"}) try: con = urllib2.urlopen(req) except: - print "I can't open %s" % url + print "I can't open abstract: %s" % url return soup = BeautifulSoup(con.read()) text = self.get_text_only(soup) - #print text return text - def crawl(self, journal_url, depth=2): + def delete_user_info(self, url): + """Delete user info from url""" + ind = url.find('&') + new_url = url[0: ind] + return new_url + + def crawl(self, journal_url, name, depth=2): """ Begin crawling journal in ACM Library """ - base = "http://dl.acm.org/" - link = self.open_tab(journal_url, "pub_series") + + print " Journal link: " + journal_url + journal_id = self.get_entry_id('journal', 'name', name) + + link = self.open_tab(journal_url, self.ARCHIVE_TAB_NAME) if link is None: return - archive_url = base + link + archive_url = self.BASE + link links = self.get_list_of_links(archive_url) if links is None: return + count = 1 for link in links: - print "Journal link: " + base + link['href'] - list_vol = self.open_tab(base + link['href'], "tab_about") - list_of_papers = self.get_list_of_links(base + list_vol) + info = link.string + + #DEBUG + #if count > 20: + # break + if not (link['href'].startswith("citation")): + continue + + ref = self.delete_user_info(link['href']) + issue_id = self.get_entry_id_issue(self.BASE + ref, info, journal_id) + + print "==============" + print " Issue link: " + self.BASE + ref + print "==============" + list_vol = self.open_tab(self.BASE + ref, self.TABLE_OF_CONTENTS_TAB_NAME) + list_of_papers = self.get_list_of_links(self.BASE + list_vol) + for paper in list_of_papers: - if len(dict(paper.attrs)) == 1: - paper_abstract = self.open_tab(base + paper['href'], "tab_abstract") - text = self.get_abstract_text(base + paper_abstract) - self.add_to_index(base + paper['href'], text) + #DEBUG + #if count > 20: + # break + 
paper_ref = self.delete_user_info(paper['href']) + + if (len(dict(paper.attrs)) == 1) and (paper_ref.startswith(self.IS_PAPER_LINK)): + ref = self.BASE + paper_ref + is_already_indexed = self.con.execute("select rowid from url_list where url = '%s'" % + ref).fetchone() + if is_already_indexed is not None: + print "%4d %s is already indexed" % (count, ref) + count += 1 + continue + + paper_abstract = self.open_tab(self.BASE + paper_ref, self.ABSTRACT_TAB_NAME) + if paper_abstract is None: + continue + text = self.get_abstract_text(self.BASE + paper_abstract) + meta = self.get_title(self.BASE + paper_ref) + + self.add_to_index(self.BASE + paper_ref, text, meta[0], meta[1], count, issue_id) + count += 1 self.db_commit() + print "%4d papers were indexed" % (count - 1) + + + def count_idf(self): + '''Count idf for each word + Set this value to the table word_list''' + print "Counting idf..." + url_count = self.con.execute("select count(rowid) from url_list").fetchone()[0] + words_urls = self.con.execute("select word_id, count(distinct url_id) from word_location " + "group by word_id").fetchall() + + for pair in words_urls: + word_id = pair[0] + num = pair[1] + idf = math.log10(url_count / num) + self.con.execute("update word_list set idf = %f where rowid = %d" % (idf, word_id)) + self.db_commit() + + + def count_vectors_length(self): + '''Count vector's length for each url (Euclidean norm of tf * idf for each word in url) + Set this value to the table url_list''' + print "Counting lengths..." + + url_count = self.con.execute("select url_id, sum(wcount), sum(count_idf) from " + "(select url_id, count(location) as wcount, " + "count(location) * count(location) * word_list.idf * word_list.idf as count_idf " + "from word_location join word_list on word_location.word_id=word_list.rowid " + "group by url_id, word_id) T1 " + "group by T1.url_id").fetchall() + + for url_record in url_count: + length = math.sqrt(url_record[2]) + length = length / url_record[1] + self.con.execute("update url_list set length = %f where rowid = %d" % (length, url_record[0])) + + self.db_commit() + + def create_index_tables(self): """ Create database tables """ - self.con.execute('create table url_list(url)') - self.con.execute('create table word_list(word)') + res = self.con.execute('select name from sqlite_master where type="table" and name="url_list"').fetchone() + if res is not None: + return + + self.con.execute('create table url_list(url, length, title, authors, issue_id)') + self.con.execute('create table issue(jour_id, name, url)') + self.con.execute('create table journal(name)') + self.con.execute('create table word_list(word, idf)') self.con.execute('create table word_location(url_id, word_id, location)') self.con.execute('create table link(from_id integer, to_id integer)') self.con.execute('create table link_words(word_id, link_id)') diff --git a/searcher.py b/searcher.py new file mode 100644 index 0000000..ddf25c6 --- /dev/null +++ b/searcher.py @@ -0,0 +1,263 @@ +# Copyright 2013 Elizabeth Shashkova +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from BeautifulSoup import * +from pysqlite2 import dbapi2 as sqlite +import operator +import math +import heapq +import sys + +class Searcher: + SHOW_ANSWER = 10 + MAGIC_NUMBER = 20 + IDF_BOUND = 1 + COS_BOUND = 0.1 + + def __init__(self, db_name): + self.con = sqlite.connect(db_name) + + def __del__(self): + self.con.close() + + def db_commit(self): + self.con.commit() + + @staticmethod + def separate_words(text): + splitter = re.compile('\\W*') + return [s.lower() for s in splitter.split(text) if s != ''] + + + def get_top_words(self, words, n): + '''Return top n tf * idf words in text + Return list of words ''' + words_top = {word: 0 for word in words} + for word in words: + words_top[word] += 1 + + for word in words_top: + word_idf = self.con.execute( + "select idf from word_list where word = '%s'" % word).fetchone() + if word_idf is None: + words_top[word] = 0 + else: + word_idf = word_idf[0] + if word_idf > self.IDF_BOUND: + words_top[word] = words_top[word] * word_idf + else: + words_top[word] = 0 + + words_top = {word: words_top[word] for word in words_top if words_top[word] > 0} + + sorted_top = sorted(words_top.iteritems(), key=operator.itemgetter(1), reverse=True) + clear_list = [pair[0] for pair in sorted_top] + if len(clear_list) <= n: + return clear_list + else: + return clear_list[0: n] + + + def find_rows(self, words, words_tf_idf_idf): + '''Find documents which contain one of words and count cos distance''' + if len(words) == 0: + return [] + + word_id_list = [] + table_num = 0 + #clause_list = '' + query = 'SELECT url_id, denorm_rank / url_list.length AS rank FROM (' \ + ' SELECT url_id, sum/count(*) AS denorm_rank FROM (' + fat_query = 'SELECT url_id as urlid, sum(weight) AS sum, count(*) AS match_words_num FROM (' + + for word in words: + word_row = self.con.execute( + "SELECT rowid FROM word_list WHERE word = '%s'" % word).fetchone() + if word_row is not None: + word_id = word_row[0] + word_id_list.append(word_id) + if table_num > 0: + fat_query += ' UNION ' + fat_query += 'SELECT url_id, count(*) * %f AS weight ' \ + 'FROM word_location WHERE word_id = %d ' \ + 'GROUP BY url_id' % (words_tf_idf_idf[word], word_id) + table_num += 1 + + fat_query += ') GROUP BY urlid ' + query += fat_query + query += ') as FatQuery JOIN word_location ON (FatQuery.urlid = word_location.url_id)' \ + 'GROUP BY urlid, sum, match_words_num)' \ + 'JOIN url_list ON url_list.rowid = url_id' +# print query +# print fat_query + result = self.con.execute(query) + rows = [row for row in result] + return rows + + + def tf(self, words): + '''Return tf of words''' + words_top = {word: 0 for word in words} + for word in words: + words_top[word] += 1 + words_freq = {word: words_top[word] / float(len(words)) for word in words_top} + + return words_freq + + def idf(self, word): + '''Return idf of word''' + idf = self.con.execute("select idf from word_list where word = '%s'" % word).fetchone() + if idf is None: + return 0 + else: + return idf[0] + + def get_top_tf_idf(self, words, top_words): + '''Return values of tf * idf * idf for words in top_words''' + tf_dict = self.tf(words) + idf_dict = {word: 0 for word in top_words} + + for word in tf_dict: + if word in top_words: + idf = self.idf(word) + idf_dict[word] = idf + tf_dict[word] = tf_dict[word] * idf * idf + else: + tf_dict[word] = 0 + + tf_dict = {word: tf_dict[word] for word in tf_dict if tf_dict[word] > 0} + inverted = [(word, tf_dict[word]) 
for word in tf_dict] + return inverted, idf_dict + + + def get_url_by_id(self, url_id): + '''Return url by its id''' + url = self.con.execute("select url_list.url, title, authors, journal.name, issue.name from url_list " + "join issue on url_list.issue_id = issue.rowid " + "join journal on issue.jour_id = journal.rowid " + "where url_list.rowid = '%s'" % url_id).fetchall()[0] + return url + + @staticmethod + def count_length(l, idfs): + '''Return Euclidean norm of the vector, saved in list of pairs''' + words_dict = {pair[0]: (pair[1] / idfs[pair[0]]) for pair in l} + length = 0 + for word in words_dict: + length = length + words_dict[word] * words_dict[word] + + length = math.sqrt(length) + return length + + @staticmethod + def cos_distance(url_words_tf_idf, url_words_length, words_tf_idf, words_length): + ''' Count cos distance between two vectors of words + url_words_tf_idf - list of pairs (word, tf * idf), where words are words from url + url_words_length - norm of vector url_words_tf_idf + words_tf_idf - list of pairs (word, tf * idf), where words are words from text + words_length - norm of vector words_tf_idf ''' + + if url_words_length == 0 or words_length == 0: + return 0 + + words_dict = {pair[0]: pair[1] for pair in words_tf_idf} + url_words_dict = {pair[0]: pair[1] for pair in url_words_tf_idf} + + sc_product = 0 + for word in words_dict: + if word in url_words_dict: + sc_product += words_dict[word] * url_words_dict[word] + + return sc_product / (url_words_length * words_length) + + + def cos_search(self, text, n): + ''' Start search by cos distance between text and documents(urls) + n means taking n top words from the text + top is counted by (tf * idf) ''' + + text_words = Searcher.separate_words(text) + top_text_words = self.get_top_words(text_words, n) + + answer = self.get_top_tf_idf(text_words, top_text_words) + text_words_tf_idf_idf = answer[0] + top_idfs = answer[1] + + text_length = Searcher.count_length(text_words_tf_idf_idf, top_idfs) + +# print top_text_words +# print text_words_tf_idf_idf + + url_ids_cos = self.find_rows(top_text_words, {word: tf_idf_idf for (word, tf_idf_idf) in text_words_tf_idf_idf}) + + #url_ids = [url_id[0] for url_id in url_ids] + url_count = len(url_ids_cos) + + url_full_count = self.con.execute("select count(rowid) from url_list").fetchone()[0] + print >> sys.stderr, "Number of documents: %d " % url_full_count + + print >> sys.stderr, "Number of documents after cutting: %d " % url_count + print >> sys.stderr, "Searching..." 
+ + ''' + heap = [] + url_ids = [] + for url_id in url_ids: + #print url_id + url_words = self.con.execute("select word from word_list join word_location on " + " word_list.rowid = word_location.word_id where " + " word_location.url_id = %s" % url_id).fetchall() + + url_words = [pair[0] for pair in url_words] + url_words_tf_idf = self.get_top_tf_idf(url_words, len(url_words)) + url_length = self.con.execute("select length from url_list where rowid = %d" % url_id).fetchone()[0] + url_cos = Searcher.cos_distance(url_words_tf_idf, url_length, text_words_tf_idf, text_length) + + if url_cos < self.COS_BOUND: + continue + + if len(heap) < self.SHOW_ANSWER: + heapq.heappush(heap, (url_cos, url_id)) + else: + heapq.heappushpop(heap, (url_cos, url_id)) + ''' + heap = [] + for url_pair in url_ids_cos: + url_cos = url_pair[1] / text_length + url_id = url_pair[0] + if len(heap) < self.SHOW_ANSWER: + heapq.heappush(heap, (url_cos, url_id)) + else: + heapq.heappushpop(heap, (url_cos, url_id)) + + + heap.sort(reverse=True) + top_n = [(pair[1], pair[0]) for pair in heap] + + print '{"articles": [' + number = 1 + for url_id in top_n: + if url_id[1] > 0: + if number > 1: + print "," + article_data = self.get_url_by_id(url_id[0]) + print "{" + print '"docid": %4d,' % url_id[0] + print '"rank": %f, ' % url_id[1] + print '"url": "%s", ' % article_data[0] + print '"title": "%s", ' % article_data[1] + print '"authors": "%s", ' % article_data[2].encode('utf-8') + print '"journal": "%s", ' % article_data[3] + print '"issue": "%s" ' % article_data[4] + print "}" + number += 1 + print "]}" \ No newline at end of file diff --git a/start_search b/start_search new file mode 100755 index 0000000..5ecee9c --- /dev/null +++ b/start_search @@ -0,0 +1,7 @@ +#!/usr/bin/env python + +import searcher +s = searcher.Searcher('db_acm') +s.cos_search("Quality of service service service service service ", 10) + +
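The README above describes cos_search() as picking the top words of a query by tf-idf and ranking documents by cosine similarity. Below is a minimal, self-contained sketch of that ranking idea on a toy in-memory corpus; the helper names (tokenize, rank) and the sample documents are hypothetical and are not part of search.py or searcher.py, which keep the index in SQLite instead.

```python
# Illustrative sketch only: tf-idf weighting and cosine-similarity ranking on a
# toy corpus. Nothing here is the repository's API.
import math
import re
from collections import Counter

def tokenize(text):
    return [w.lower() for w in re.split(r'\W+', text) if w]

def tf(tokens):
    counts = Counter(tokens)
    total = float(len(tokens))
    return dict((w, c / total) for w, c in counts.items())

def idf(all_doc_tokens):
    n_docs = len(all_doc_tokens)
    doc_freq = Counter(w for tokens in all_doc_tokens for w in set(tokens))
    return dict((w, math.log10(float(n_docs) / df)) for w, df in doc_freq.items())

def cosine(a, b):
    dot = sum(weight * b.get(word, 0.0) for word, weight in a.items())
    norm_a = math.sqrt(sum(w * w for w in a.values()))
    norm_b = math.sqrt(sum(w * w for w in b.values()))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0

def rank(query, documents):
    doc_tokens = [tokenize(d) for d in documents]
    idfs = idf(doc_tokens)
    doc_vectors = [dict((w, f * idfs[w]) for w, f in tf(t).items()) for t in doc_tokens]
    query_vector = dict((w, f * idfs.get(w, 0.0)) for w, f in tf(tokenize(query)).items())
    scores = [(cosine(query_vector, v), i) for i, v in enumerate(doc_vectors)]
    return sorted(scores, reverse=True)   # best (score, document index) first

documents = ["Quality of service in computer networks",
             "A survey of machine learning methods",
             "Measuring network service quality"]
print(rank("quality of service", documents))
```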
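crawl_acm now ends with count_idf() and count_vectors_length(), which store each word's idf in word_list and each document's normalised vector length in url_list. The sketch below reproduces those two computations on a toy copy of that schema, using the standard sqlite3 module and made-up rows (the repository itself uses pysqlite2 and real crawled data). One caveat worth checking in the patch itself: under Python 2, url_count / num in count_idf() is integer division, so the sketch divides in floating point.

```python
# Sketch only: idf and document-length precomputation on a toy copy of the
# crawler's schema. sqlite3 and the sample rows are stand-ins, not repo code.
import math
import sqlite3

con = sqlite3.connect(':memory:')
con.execute('create table url_list(url, length)')
con.execute('create table word_list(word, idf)')
con.execute('create table word_location(url_id, word_id, location)')

con.executemany('insert into url_list(url) values (?)', [('u1',), ('u2',), ('u3',)])
con.executemany('insert into word_list(word) values (?)',
                [('quality',), ('service',), ('networks',)])
# (url_id, word_id, location): three toy documents with a few word occurrences
con.executemany('insert into word_location values (?, ?, ?)',
                [(1, 1, 0), (1, 2, 1), (2, 1, 0), (2, 3, 1), (3, 2, 0), (3, 2, 1)])

# idf(word) = log10(number of documents / number of documents containing the word)
n_docs = con.execute('select count(rowid) from url_list').fetchone()[0]
doc_freqs = con.execute('select word_id, count(distinct url_id) from word_location '
                        'group by word_id').fetchall()
for word_id, doc_freq in doc_freqs:
    idf = math.log10(float(n_docs) / doc_freq)  # float() avoids Py2 integer division
    con.execute('update word_list set idf = ? where rowid = ?', (idf, word_id))

# length(url): norm of the count*idf vector divided by the document's word count,
# which equals the Euclidean norm of its tf*idf vector
for (url_id,) in con.execute('select rowid from url_list').fetchall():
    rows = con.execute('select count(location), idf from word_location '
                       'join word_list on word_list.rowid = word_id '
                       'where url_id = ? group by word_id', (url_id,)).fetchall()
    total_words = sum(count for count, _ in rows)
    norm = math.sqrt(sum((count * idf) ** 2 for count, idf in rows))
    con.execute('update url_list set length = ? where rowid = ?',
                (norm / total_words, url_id))
con.commit()

print(con.execute('select word, idf from word_list').fetchall())
print(con.execute('select url, length from url_list').fetchall())
```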
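At the end of cos_search(), the best SHOW_ANSWER documents are kept by pushing (cosine score, url_id) pairs through a fixed-size heap. The fragment below illustrates that top-k pattern with heapq on made-up scores; it is a stand-alone sketch, not the repository's code.

```python
# Stand-alone illustration of the fixed-size heap used to keep the top-k
# (score, doc_id) pairs; the scores here are made up.
import heapq

K = 3
scored_docs = [(0.12, 1), (0.80, 2), (0.05, 3), (0.44, 4), (0.61, 5)]

heap = []
for score, doc_id in scored_docs:
    if len(heap) < K:
        heapq.heappush(heap, (score, doc_id))      # heap not full yet
    else:
        heapq.heappushpop(heap, (score, doc_id))   # evict the current minimum

heap.sort(reverse=True)                            # best match first
print(heap)   # [(0.8, 2), (0.61, 5), (0.44, 4)]
```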
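cos_search() prints its result as hand-assembled JSON ({"articles": [...]}); a title or author string containing a double quote would make that output invalid JSON. If that matters to the consumer of this output, one option is to build a list of dicts and let the json module do the escaping, as in the sketch below; the field names mirror the ones cos_search() prints, and the sample values are made up.

```python
# Sketch: emitting the same fields via the json module so quotes in titles or
# author lists cannot break the output. All values below are invented examples.
import json

articles = [
    {"docid": 42, "rank": 0.731,
     "url": "http://dl.acm.org/citation.cfm?id=0000000",
     "title": 'An example title containing "quotes"',
     "authors": "A. Author; B. Writer",
     "journal": "ACM Computing Surveys (CSUR)",
     "issue": "Volume 1 Issue 1"},
]
print(json.dumps({"articles": articles}, indent=2))
```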