From c05161818adebb459084f536ccef5af11d0c5ba7 Mon Sep 17 00:00:00 2001 From: elizabeth Date: Mon, 2 Dec 2013 17:36:43 +0400 Subject: [PATCH 01/22] Some bugs fixed --- search.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/search.py b/search.py index 506dc75..3601036 100755 --- a/search.py +++ b/search.py @@ -159,15 +159,29 @@ def crawl(self, journal_url, depth=2): print "Journal link: " + base + link['href'] list_vol = self.open_tab(base + link['href'], "tab_about") list_of_papers = self.get_list_of_links(base + list_vol) + prefix = "citation" for paper in list_of_papers: - if len(dict(paper.attrs)) == 1: + if (len(dict(paper.attrs)) == 1) and (paper['href'].startswith(prefix)): paper_abstract = self.open_tab(base + paper['href'], "tab_abstract") + if paper_abstract is None: + print "I can't get paper's abstract: " + base + paper['href'] + continue text = self.get_abstract_text(base + paper_abstract) self.add_to_index(base + paper['href'], text) self.db_commit() def create_index_tables(self): """ Create database tables """ + res = self.con.execute('select name from sqlite_master where type="table" and name="url_list"').fetchone() + if res is not None: + self.con.execute('delete from url_list') + self.con.execute('delete from word_list') + self.con.execute('delete from word_location') + self.con.execute('delete from link') + self.con.execute('delete from link_words') + self.db_commit() + return + self.con.execute('create table url_list(url)') self.con.execute('create table word_list(word)') self.con.execute('create table word_location(url_id, word_id, location)') From 8ca9c46053f820e2b1fe2563298817ef431bf2dd Mon Sep 17 00:00:00 2001 From: elizabeth Date: Tue, 3 Dec 2013 13:25:35 +0400 Subject: [PATCH 02/22] Some new bugs fixed --- search.py | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/search.py b/search.py index 3601036..7fe8317 100755 --- a/search.py +++ b/search.py @@ -71,14 +71,21 @@ def separate_words(self, text): splitter = re.compile('\\W*') return [s.lower() for s in splitter.split(text) if s != ''] - def add_to_index(self, url, text): + def add_to_index(self, url, text, title, count): """ Add all words from text (from url) to database. This url becomes indexed """ if self.is_indexed(url): return - print 'Indexing %s' % url + print '%4d Indexing %s' % (count, url) + + words = self.separate_words(title) + if (len(text) < 50) and ("An abstract is not available." in text): + print "An abstract is not available." 
+ else: + words_from_abstract = self.separate_words(text) + for word in words_from_abstract: + words.append(word) - words = self.separate_words(text) url_id = self.get_entry_id('url_list', 'url', url) for i in range(len(words)): @@ -131,6 +138,17 @@ def get_list_of_links(self, url): links = soup('a') return links + def get_title(self, url): + req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"}) + try: + con = urllib2.urlopen(req) + except: + print "I can't open %s" % url + return + soup = BeautifulSoup(con.read()) + title = soup.title.string + return title + def get_abstract_text(self, url): """ Return text of article's abstract""" req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"}) @@ -141,7 +159,6 @@ def get_abstract_text(self, url): return soup = BeautifulSoup(con.read()) text = self.get_text_only(soup) - #print text return text def crawl(self, journal_url, depth=2): @@ -155,21 +172,26 @@ def crawl(self, journal_url, depth=2): if links is None: return + count = 1 for link in links: + print "==============" print "Journal link: " + base + link['href'] + print "==============" list_vol = self.open_tab(base + link['href'], "tab_about") list_of_papers = self.get_list_of_links(base + list_vol) prefix = "citation" for paper in list_of_papers: if (len(dict(paper.attrs)) == 1) and (paper['href'].startswith(prefix)): paper_abstract = self.open_tab(base + paper['href'], "tab_abstract") - if paper_abstract is None: - print "I can't get paper's abstract: " + base + paper['href'] - continue text = self.get_abstract_text(base + paper_abstract) - self.add_to_index(base + paper['href'], text) + title = self.get_title(base + paper['href']) + + self.add_to_index(base + paper['href'], text, title, count) + count += 1 self.db_commit() + print "%4d papers were indexed" % count + def create_index_tables(self): """ Create database tables """ res = self.con.execute('select name from sqlite_master where type="table" and name="url_list"').fetchone() From 46f7e2a15b9b0678fdc1c9e7e98f42c5fb5b7df5 Mon Sep 17 00:00:00 2001 From: elizabeth Date: Tue, 3 Dec 2013 13:40:37 +0400 Subject: [PATCH 03/22] String constants are created --- search.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/search.py b/search.py index 7fe8317..cc72e15 100755 --- a/search.py +++ b/search.py @@ -18,6 +18,12 @@ class Crawler: + ABSTRACT_TAB_NAME = "tab_abstract" + TABLE_OF_CONTENTS_TAB_NAME = "tab_about" + ARCHIVE_TAB_NAME = "pub_series" + BASE = "http://dl.acm.org/" + IS_PAPER_LINK = "citation" + def __init__(self, db_name): self.con = sqlite.connect(db_name) @@ -163,11 +169,11 @@ def get_abstract_text(self, url): def crawl(self, journal_url, depth=2): """ Begin crawling journal in ACM Library """ - base = "http://dl.acm.org/" - link = self.open_tab(journal_url, "pub_series") + + link = self.open_tab(journal_url, self.ARCHIVE_TAB_NAME) if link is None: return - archive_url = base + link + archive_url = self.BASE + link links = self.get_list_of_links(archive_url) if links is None: return @@ -175,23 +181,24 @@ def crawl(self, journal_url, depth=2): count = 1 for link in links: print "==============" - print "Journal link: " + base + link['href'] + print " Journal link: " + self.BASE + link['href'] print "==============" - list_vol = self.open_tab(base + link['href'], "tab_about") - list_of_papers = self.get_list_of_links(base + list_vol) - prefix = "citation" + list_vol = self.open_tab(self.BASE + link['href'], self.TABLE_OF_CONTENTS_TAB_NAME) + 
list_of_papers = self.get_list_of_links(self.BASE + list_vol) + for paper in list_of_papers: - if (len(dict(paper.attrs)) == 1) and (paper['href'].startswith(prefix)): - paper_abstract = self.open_tab(base + paper['href'], "tab_abstract") - text = self.get_abstract_text(base + paper_abstract) - title = self.get_title(base + paper['href']) + if (len(dict(paper.attrs)) == 1) and (paper['href'].startswith(self.IS_PAPER_LINK)): + paper_abstract = self.open_tab(self.BASE + paper['href'], self.ABSTRACT_TAB_NAME) + text = self.get_abstract_text(self.BASE + paper_abstract) + title = self.get_title(self.BASE + paper['href']) - self.add_to_index(base + paper['href'], text, title, count) + self.add_to_index(self.BASE + paper['href'], text, title, count) count += 1 self.db_commit() print "%4d papers were indexed" % count + def create_index_tables(self): """ Create database tables """ res = self.con.execute('select name from sqlite_master where type="table" and name="url_list"').fetchone() From 231abeb4e39061247f2e5ebbb524ac6eef471769 Mon Sep 17 00:00:00 2001 From: elizabeth Date: Tue, 3 Dec 2013 17:19:47 +0400 Subject: [PATCH 04/22] And another new bugs fixed --- search.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/search.py b/search.py index cc72e15..95bb3fa 100755 --- a/search.py +++ b/search.py @@ -23,6 +23,7 @@ class Crawler: ARCHIVE_TAB_NAME = "pub_series" BASE = "http://dl.acm.org/" IS_PAPER_LINK = "citation" + ABS_NOT_AVAILABLE = "An abstract is not available." def __init__(self, db_name): self.con = sqlite.connect(db_name) @@ -85,8 +86,8 @@ def add_to_index(self, url, text, title, count): print '%4d Indexing %s' % (count, url) words = self.separate_words(title) - if (len(text) < 50) and ("An abstract is not available." in text): - print "An abstract is not available." 
+ if (len(text) < 50) and (self.ABS_NOT_AVAILABLE in text): + print self.ABS_NOT_AVAILABLE else: words_from_abstract = self.separate_words(text) for word in words_from_abstract: @@ -167,6 +168,12 @@ def get_abstract_text(self, url): text = self.get_text_only(soup) return text + def delete_user_info(self, url): + """Delete user info from url""" + ind = url.find('&') + new_url = url[0: ind] + return new_url + def crawl(self, journal_url, depth=2): """ Begin crawling journal in ACM Library """ @@ -180,19 +187,23 @@ def crawl(self, journal_url, depth=2): count = 1 for link in links: + ref = self.delete_user_info(link['href']) print "==============" - print " Journal link: " + self.BASE + link['href'] + print " Journal link: " + self.BASE + ref print "==============" - list_vol = self.open_tab(self.BASE + link['href'], self.TABLE_OF_CONTENTS_TAB_NAME) + list_vol = self.open_tab(self.BASE + ref, self.TABLE_OF_CONTENTS_TAB_NAME) list_of_papers = self.get_list_of_links(self.BASE + list_vol) for paper in list_of_papers: - if (len(dict(paper.attrs)) == 1) and (paper['href'].startswith(self.IS_PAPER_LINK)): - paper_abstract = self.open_tab(self.BASE + paper['href'], self.ABSTRACT_TAB_NAME) + paper_ref = self.delete_user_info(paper['href']) + if (len(dict(paper.attrs)) == 1) and (paper_ref.startswith(self.IS_PAPER_LINK)): + paper_abstract = self.open_tab(self.BASE + paper_ref, self.ABSTRACT_TAB_NAME) + if paper_abstract is None: + continue text = self.get_abstract_text(self.BASE + paper_abstract) - title = self.get_title(self.BASE + paper['href']) + title = self.get_title(self.BASE + paper_ref) - self.add_to_index(self.BASE + paper['href'], text, title, count) + self.add_to_index(self.BASE + paper_ref, text, title, count) count += 1 self.db_commit() From 9e5ee103a0ad993a6b57dc3a2aa35f880b0f159c Mon Sep 17 00:00:00 2001 From: elizabeth Date: Thu, 5 Dec 2013 23:42:04 +0400 Subject: [PATCH 05/22] Some changes --- search.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/search.py b/search.py index 95bb3fa..00f6e3a 100755 --- a/search.py +++ b/search.py @@ -85,7 +85,15 @@ def add_to_index(self, url, text, title, count): return print '%4d Indexing %s' % (count, url) - words = self.separate_words(title) + if (title is None) and (text is None): + print "Neither text nor title are available" + return + + words = [] + if title is not None: + #print title + words = self.separate_words(title) + if (len(text) < 50) and (self.ABS_NOT_AVAILABLE in text): print self.ABS_NOT_AVAILABLE else: @@ -150,7 +158,7 @@ def get_title(self, url): try: con = urllib2.urlopen(req) except: - print "I can't open %s" % url + print "I can't get title of: %s" % url return soup = BeautifulSoup(con.read()) title = soup.title.string @@ -162,7 +170,7 @@ def get_abstract_text(self, url): try: con = urllib2.urlopen(req) except: - print "I can't open %s" % url + print "I can't open abstract: %s" % url return soup = BeautifulSoup(con.read()) text = self.get_text_only(soup) From 2864030e81ac0c622d1d6acc97c4873d66c68d07 Mon Sep 17 00:00:00 2001 From: elizabeth Date: Thu, 5 Dec 2013 23:43:17 +0400 Subject: [PATCH 06/22] Very simple searcher --- searcher.py | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++ start_search | 7 ++++ 2 files changed, 99 insertions(+) create mode 100644 searcher.py create mode 100755 start_search diff --git a/searcher.py b/searcher.py new file mode 100644 index 0000000..48bf076 --- /dev/null +++ b/searcher.py @@ -0,0 +1,92 @@ +# Copyright 2013 Elizabeth Shashkova +# +# Licensed 
under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from BeautifulSoup import * +from pysqlite2 import dbapi2 as sqlite +import operator +import search + +class Searcher: + + def __init__(self, db_name): + self.con = sqlite.connect(db_name) + + def __del__(self): + self.con.close() + + def separate_words(self, text): + splitter = re.compile('\\W*') + return [s.lower() for s in splitter.split(text) if s != ''] + + def get_top_words(self, text, n): + '''Return top n words in text''' + words = self.separate_words(text) + words_top = {} + for word in words: + if word not in search.ignore_words: + if word in words_top: + words_top[word] += 1 + else: + words_top[word] = 1 + sorted_top = sorted(words_top.iteritems(), key=operator.itemgetter(1)) + words_only = [pair[0] for pair in sorted_top] + + if len(sorted_top) <= n: + return words_only + else: + return words_only[len(sorted_top) - n: len(sorted_top)] + + + def find_rows(self, words): + '''Find documents which contain _all_ words''' + word_id_list = [] + table_num = 0 + field_list = 'w0.url_id' + table_list = '' + clause_list = '' + + for word in words: + word_row = self.con.execute( + "select rowid from word_list where word = '%s'" % word).fetchone() + if word_row is not None: + word_id = word_row[0] + #print "word_id: %d" % word_id + word_id_list.append(word_id) + if table_num > 0: + table_list += ', ' + clause_list += ' and ' + clause_list += 'w%d.url_id = w%d.url_id and ' % (table_num - 1, table_num) + #field_list += ', w%d.location' % table_num + table_list += 'word_location w%d' % table_num + clause_list += 'w%d.word_id = %d' % (table_num, word_id) + table_num += 1 + + query = 'select distinct %s from %s where %s ' % (field_list, table_list, clause_list) + result = self.con.execute(query) + rows = [row for row in result] + return rows + + def get_url_by_id(self, url_id): + '''Return url by its id''' + url = self.con.execute("select url from url_list where rowid = '%s'" % url_id).fetchone()[0] + return url + + def top_search(self, text, n): + '''Start search by n top words in text''' + words = self.get_top_words(text, n) + url_id_list = self.find_rows(words) + for url_id in url_id_list: + print self.get_url_by_id(url_id) + + if len(url_id_list) == 0: + print "I can't find anything. Sorry... :(" + diff --git a/start_search b/start_search new file mode 100755 index 0000000..a230a75 --- /dev/null +++ b/start_search @@ -0,0 +1,7 @@ +#!/usr/bin/env python + +import searcher +s = searcher.Searcher('db_acm') +s.top_search("Quality of service (QoS) can be a critical element for achieving the business goals of a service provider, for the acceptance of a service by the user, or for guaranteeing service characteristics in a composition of services, where a service is defined as either a software or a software-support (i.e., infrastructural) service which is available on any type of network or electronic channel. The goal of this article is to compare the approaches to QoS description in the literature, where several models and metamodels are included. 
consider a large spectrum of models and metamodels to describe service quality, ranging from ontological approaches to define quality measures, metrics, and dimensions, to metamodels enabling the specification of quality-based service requirements and capabilities as well as of SLAs (Service-Level Agreements) and SLA templates for service provisioning.", 10) + + From 0b22cfe0d77105bd762282c20628288f70a8de59 Mon Sep 17 00:00:00 2001 From: elizabeth Date: Tue, 17 Dec 2013 01:01:33 +0400 Subject: [PATCH 07/22] Simple searcher using cosine similarity is implemented --- search.py | 5 +- searcher.py | 134 +++++++++++++++++++++++++++++++++++++++++++++------ start_search | 3 +- 3 files changed, 123 insertions(+), 19 deletions(-) diff --git a/search.py b/search.py index 00f6e3a..3db1657 100755 --- a/search.py +++ b/search.py @@ -14,7 +14,6 @@ from BeautifulSoup import * from pysqlite2 import dbapi2 as sqlite -ignore_words = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it']) class Crawler: @@ -105,8 +104,6 @@ def add_to_index(self, url, text, title, count): for i in range(len(words)): word = words[i] - if word in ignore_words: - continue word_id = self.get_entry_id('word_list', 'word', word) #print word_id self.con.execute( @@ -231,7 +228,7 @@ def create_index_tables(self): return self.con.execute('create table url_list(url)') - self.con.execute('create table word_list(word)') + self.con.execute('create table word_list(word, idf)') self.con.execute('create table word_location(url_id, word_id, location)') self.con.execute('create table link(from_id integer, to_id integer)') self.con.execute('create table link_words(word_id, link_id)') diff --git a/searcher.py b/searcher.py index 48bf076..9c4b193 100644 --- a/searcher.py +++ b/searcher.py @@ -14,8 +14,10 @@ from pysqlite2 import dbapi2 as sqlite import operator import search +import math class Searcher: + SHOW_ANSWER = 10 def __init__(self, db_name): self.con = sqlite.connect(db_name) @@ -23,27 +25,68 @@ def __init__(self, db_name): def __del__(self): self.con.close() + def db_commit(self): + self.con.commit() + def separate_words(self, text): splitter = re.compile('\\W*') return [s.lower() for s in splitter.split(text) if s != ''] - def get_top_words(self, text, n): + def get_top_words(self, words, n): '''Return top n words in text''' - words = self.separate_words(text) - words_top = {} + words_top = {word: 0 for word in words} for word in words: - if word not in search.ignore_words: - if word in words_top: - words_top[word] += 1 - else: - words_top[word] = 1 - sorted_top = sorted(words_top.iteritems(), key=operator.itemgetter(1)) - words_only = [pair[0] for pair in sorted_top] + words_top[word] += 1 + + sorted_top = sorted(words_top.iteritems(), key=operator.itemgetter(1), reverse=True) + #print sorted_top if len(sorted_top) <= n: - return words_only + return sorted_top else: - return words_only[len(sorted_top) - n: len(sorted_top)] + return sorted_top[0: n] + + def count_idf(self): + print "Counting idf..." 
+ url_count = self.con.execute("select count(rowid) from url_list").fetchone()[0] + words_count = self.con.execute("select count(rowid) from word_list").fetchone()[0] + max = 0 + for word_id in range(1, words_count + 1): + urls = self.con.execute("select distinct url_id from word_location " + "where word_id = %s" % word_id).fetchall() + num = len(urls) + if num > max: + max = num + idf = math.log10(url_count / num) + self.con.execute("update word_list set idf = %f where rowid = %d" % (idf, word_id)) + self.db_commit() + + + def tf(self, words): + '''Return top n words in text''' + words_top = {word: 0 for word in words} + for word in words: + words_top[word] += 1 + words_top = {word: words_top[word] / float(len(words)) for word in words_top} + + sorted_top = sorted(words_top.iteritems(), key=operator.itemgetter(1), reverse=True) + return sorted_top + + + def get_top_tf_idf(self, words, n): + '''Get top n words from list words using tf * idf''' + tf_list = self.tf(words) + top_dict = {pair[0]: pair[1] for pair in tf_list} + for word in top_dict: + idf = self.con.execute("select idf from word_list where word = '%s'" % word).fetchone()[0] + top_dict[word] = top_dict[word] * idf + + sorted_top = sorted(top_dict.iteritems(), key=operator.itemgetter(1), reverse=True) + + if len(sorted_top) <= n: + return sorted_top + else: + return sorted_top[0: n] def find_rows(self, words): @@ -81,8 +124,10 @@ def get_url_by_id(self, url_id): return url def top_search(self, text, n): - '''Start search by n top words in text''' - words = self.get_top_words(text, n) + '''Start search by n top words in text + top is counted by number of repeats''' + words_pairs = self.get_top_words(self.separate_words(text), n) + words = [pair[0] for pair in words_pairs] url_id_list = self.find_rows(words) for url_id in url_id_list: print self.get_url_by_id(url_id) @@ -90,3 +135,64 @@ def top_search(self, text, n): if len(url_id_list) == 0: print "I can't find anything. Sorry... :(" + + def cos_distance(self, url_id, words): + '''Count cos distance between list of words and article with id url = url_id''' + sum = 0 + url_words = self.con.execute("select word from word_list join word_location on " + "word_list.rowid = word_location.word_id where " + "word_location.url_id = %s" % url_id).fetchall() + + url_words = [pair[0] for pair in url_words] + url_words_counted = self.get_top_words(url_words, len(url_words)) + #list of pairs + + words_dict = {pair[0]: pair[1] for pair in words} + url_words_dict = {pair[0]: pair[1] for pair in url_words_counted} + + for word in words_dict: + if word in url_words_dict: + sum = sum + words_dict[word] * url_words_dict[word] + + length1 = 0 + for word in words_dict: + length1 = length1 + words_dict[word] * words_dict[word] + length1 = math.sqrt(length1) + + length2 = 0 + for word in url_words_dict: + length2 = length2 + url_words_dict[word] * url_words_dict[word] + length2 = math.sqrt(length2) + + cos = sum / (length1 * length2) + return cos + + + def cos_search(self, text, n): + ''' Start search by cos distance between text and documents + n means taking n top words from the text + top is counted by (number of repeats) * (idf) ''' + + all_words = self.separate_words(text) + words = self.get_top_tf_idf(all_words, n) + #list of pairs + + url_count = self.con.execute("select count(rowid) from url_list").fetchone()[0] + print "Number of documents is %s" % url_count + print "Searching..." 
+ + url_cos = {url_id: self.cos_distance(url_id, words) for url_id in range(1, url_count + 1)} + top_n = sorted(url_cos.iteritems(), key=operator.itemgetter(1), reverse=True) + top_n = top_n[0: self.SHOW_ANSWER] + + print "Answer: " + for url_id in top_n: + print "id = %4s cos = %s" % (url_id[0], url_id[1]) + print self.get_url_by_id(url_id[0]) + + + + + + + diff --git a/start_search b/start_search index a230a75..3bb0776 100755 --- a/start_search +++ b/start_search @@ -2,6 +2,7 @@ import searcher s = searcher.Searcher('db_acm') -s.top_search("Quality of service (QoS) can be a critical element for achieving the business goals of a service provider, for the acceptance of a service by the user, or for guaranteeing service characteristics in a composition of services, where a service is defined as either a software or a software-support (i.e., infrastructural) service which is available on any type of network or electronic channel. The goal of this article is to compare the approaches to QoS description in the literature, where several models and metamodels are included. consider a large spectrum of models and metamodels to describe service quality, ranging from ontological approaches to define quality measures, metrics, and dimensions, to metamodels enabling the specification of quality-based service requirements and capabilities as well as of SLAs (Service-Level Agreements) and SLA templates for service provisioning.", 10) +s.count_idf() +s.cos_search("Quality of service (QoS) can be a critical element for achieving the business goals of a service provider, for the acceptance of a service by the user, or for guaranteeing service characteristics in a composition of services, where a service is defined as either a software or a software-support (i.e., infrastructural) service which is available on any type of network or electronic channel. The goal of this article is to compare the approaches to QoS description in the literature, where several models and metamodels are included. consider a large spectrum of models and metamodels to describe service quality, ranging from ontological approaches to define quality measures, metrics, and dimensions, to metamodels enabling the specification of quality-based service requirements and capabilities as well as of SLAs (Service-Level Agreements) and SLA templates for service provisioning.", 10) From 835b1da7788ef7a58c0177ce91417606d01c0c52 Mon Sep 17 00:00:00 2001 From: Elizabeth Shashkova Date: Tue, 17 Dec 2013 00:11:11 +0300 Subject: [PATCH 08/22] Update README.md --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index c964e68..d119c72 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,19 @@ Papeeria is an IDE for your computer science papers Article search system for Papeeria +Crawling documents: +------------------- + Class Crawler creates index database for all articles' abstracts from the ACM Computing Surveys (CSUR) journal. Create search index database: ./crawl_acm + +Searching: +---------- + +Class Searcher searchs top n documents by the text. Start searching: +./start_searching + +This script calls method cos_search(...), which takes 2 arguments: text for searching and number of top words in text, using for searching. You can modify both of them. + +Method cos_search() implements searcher using tf-idf for counting top words in the text and cosine similarity to find appropriate documents. 
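[Note on the method described in the README hunk above: a minimal sketch of the tf-idf weighting and cosine similarity that cos_search builds on. The word lists and idf values below are made up for illustration; in the project itself tf is computed from word_location, idf is read from the word_list table, and document vector lengths come from url_list.]

    import math

    def tf(words):
        # term frequency: the share of the document occupied by each word
        counts = {}
        for w in words:
            counts[w] = counts.get(w, 0) + 1
        return {w: counts[w] / float(len(words)) for w in counts}

    def cosine(a, b):
        # cosine similarity between two sparse {word: weight} vectors
        dot = sum(a[w] * b[w] for w in a if w in b)
        norm_a = math.sqrt(sum(v * v for v in a.values()))
        norm_b = math.sqrt(sum(v * v for v in b.values()))
        if norm_a == 0 or norm_b == 0:
            return 0
        return dot / (norm_a * norm_b)

    # assumed idf values, not taken from the real word_list table
    idf = {'service': 0.3, 'quality': 0.8, 'provider': 1.5}
    query_tf = tf("quality of service service".split())
    doc_tf = tf("quality of service provider".split())
    query_vec = {w: query_tf[w] * idf.get(w, 0.0) for w in query_tf}
    doc_vec = {w: doc_tf[w] * idf.get(w, 0.0) for w in doc_tf}
    print cosine(query_vec, doc_vec)

[Ranking every row of url_list by this value and keeping the SHOW_ANSWER largest scores is what cos_search does; later patches in this series replace the full sort with a heap and precompute the document vector lengths during crawling.]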
From 35a50d2b04772512a19ad55ba5ca76d68cb82d5b Mon Sep 17 00:00:00 2001 From: elizabeth Date: Wed, 18 Dec 2013 17:04:12 +0400 Subject: [PATCH 09/22] idf counting replaced to crawl_acm --- crawl_acm | 3 ++- search.py | 17 +++++++++++++++++ searcher.py | 15 --------------- start_search | 1 - 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/crawl_acm b/crawl_acm index 8ca0a7e..0d9c2ee 100755 --- a/crawl_acm +++ b/crawl_acm @@ -3,6 +3,7 @@ import search c = search.Crawler('db_acm') c.create_index_tables() -c.crawl("http://dl.acm.org/pub.cfm?id=J204&CFID=261281623&CFTOKEN=70296603") +c.crawl("http://dl.acm.org/pub.cfm?id=J204&CFID=261281623&CFTOKEN=70296603") +c.count_idf() diff --git a/search.py b/search.py index 3db1657..a1acc83 100755 --- a/search.py +++ b/search.py @@ -13,6 +13,7 @@ import urllib2 from BeautifulSoup import * from pysqlite2 import dbapi2 as sqlite +import math @@ -215,6 +216,22 @@ def crawl(self, journal_url, depth=2): print "%4d papers were indexed" % count + def count_idf(self): + print "Counting idf..." + url_count = self.con.execute("select count(rowid) from url_list").fetchone()[0] + words_count = self.con.execute("select count(rowid) from word_list").fetchone()[0] + max = 0 + for word_id in range(1, words_count + 1): + urls = self.con.execute("select distinct url_id from word_location " + "where word_id = %s" % word_id).fetchall() + num = len(urls) + if num > max: + max = num + idf = math.log10(url_count / num) + self.con.execute("update word_list set idf = %f where rowid = %d" % (idf, word_id)) + self.db_commit() + + def create_index_tables(self): """ Create database tables """ res = self.con.execute('select name from sqlite_master where type="table" and name="url_list"').fetchone() diff --git a/searcher.py b/searcher.py index 9c4b193..b79177d 100644 --- a/searcher.py +++ b/searcher.py @@ -46,21 +46,6 @@ def get_top_words(self, words, n): else: return sorted_top[0: n] - def count_idf(self): - print "Counting idf..." - url_count = self.con.execute("select count(rowid) from url_list").fetchone()[0] - words_count = self.con.execute("select count(rowid) from word_list").fetchone()[0] - max = 0 - for word_id in range(1, words_count + 1): - urls = self.con.execute("select distinct url_id from word_location " - "where word_id = %s" % word_id).fetchall() - num = len(urls) - if num > max: - max = num - idf = math.log10(url_count / num) - self.con.execute("update word_list set idf = %f where rowid = %d" % (idf, word_id)) - self.db_commit() - def tf(self, words): '''Return top n words in text''' diff --git a/start_search b/start_search index 3bb0776..69f89d3 100755 --- a/start_search +++ b/start_search @@ -2,7 +2,6 @@ import searcher s = searcher.Searcher('db_acm') -s.count_idf() s.cos_search("Quality of service (QoS) can be a critical element for achieving the business goals of a service provider, for the acceptance of a service by the user, or for guaranteeing service characteristics in a composition of services, where a service is defined as either a software or a software-support (i.e., infrastructural) service which is available on any type of network or electronic channel. The goal of this article is to compare the approaches to QoS description in the literature, where several models and metamodels are included. 
consider a large spectrum of models and metamodels to describe service quality, ranging from ontological approaches to define quality measures, metrics, and dimensions, to metamodels enabling the specification of quality-based service requirements and capabilities as well as of SLAs (Service-Level Agreements) and SLA templates for service provisioning.", 10) From b2e93e848700132c37776eba202945acf66f40eb Mon Sep 17 00:00:00 2001 From: elizabeth Date: Mon, 23 Dec 2013 01:01:52 +0400 Subject: [PATCH 10/22] remove old top searcher --- searcher.py | 45 ++------------------------------------------- 1 file changed, 2 insertions(+), 43 deletions(-) diff --git a/searcher.py b/searcher.py index b79177d..47d62c6 100644 --- a/searcher.py +++ b/searcher.py @@ -13,8 +13,8 @@ from BeautifulSoup import * from pysqlite2 import dbapi2 as sqlite import operator -import search import math +import heapq class Searcher: SHOW_ANSWER = 10 @@ -48,7 +48,7 @@ def get_top_words(self, words, n): def tf(self, words): - '''Return top n words in text''' + '''Return sorted words''' words_top = {word: 0 for word in words} for word in words: words_top[word] += 1 @@ -74,52 +74,11 @@ def get_top_tf_idf(self, words, n): return sorted_top[0: n] - def find_rows(self, words): - '''Find documents which contain _all_ words''' - word_id_list = [] - table_num = 0 - field_list = 'w0.url_id' - table_list = '' - clause_list = '' - - for word in words: - word_row = self.con.execute( - "select rowid from word_list where word = '%s'" % word).fetchone() - if word_row is not None: - word_id = word_row[0] - #print "word_id: %d" % word_id - word_id_list.append(word_id) - if table_num > 0: - table_list += ', ' - clause_list += ' and ' - clause_list += 'w%d.url_id = w%d.url_id and ' % (table_num - 1, table_num) - #field_list += ', w%d.location' % table_num - table_list += 'word_location w%d' % table_num - clause_list += 'w%d.word_id = %d' % (table_num, word_id) - table_num += 1 - - query = 'select distinct %s from %s where %s ' % (field_list, table_list, clause_list) - result = self.con.execute(query) - rows = [row for row in result] - return rows - def get_url_by_id(self, url_id): '''Return url by its id''' url = self.con.execute("select url from url_list where rowid = '%s'" % url_id).fetchone()[0] return url - def top_search(self, text, n): - '''Start search by n top words in text - top is counted by number of repeats''' - words_pairs = self.get_top_words(self.separate_words(text), n) - words = [pair[0] for pair in words_pairs] - url_id_list = self.find_rows(words) - for url_id in url_id_list: - print self.get_url_by_id(url_id) - - if len(url_id_list) == 0: - print "I can't find anything. Sorry... 
:(" - def cos_distance(self, url_id, words): '''Count cos distance between list of words and article with id url = url_id''' From fa781bc1b77253f06084438b57363cc3148c9005 Mon Sep 17 00:00:00 2001 From: elizabeth Date: Mon, 23 Dec 2013 15:20:35 +0400 Subject: [PATCH 11/22] Length of vectors counting and top n finding changed --- crawl_acm | 1 + search.py | 40 +++++++++++++++++++------- searcher.py | 83 +++++++++++++++++++++++++++++------------------------ 3 files changed, 76 insertions(+), 48 deletions(-) diff --git a/crawl_acm b/crawl_acm index 0d9c2ee..5756822 100755 --- a/crawl_acm +++ b/crawl_acm @@ -5,5 +5,6 @@ c = search.Crawler('db_acm') c.create_index_tables() c.crawl("http://dl.acm.org/pub.cfm?id=J204&CFID=261281623&CFTOKEN=70296603") c.count_idf() +c.count_vectors_length() diff --git a/search.py b/search.py index a1acc83..fd39b84 100755 --- a/search.py +++ b/search.py @@ -213,25 +213,45 @@ def crawl(self, journal_url, depth=2): count += 1 self.db_commit() - print "%4d papers were indexed" % count + print "%4d papers were indexed" % (count - 1) def count_idf(self): + '''Count idf for each word + Set this value to the table word_list''' print "Counting idf..." url_count = self.con.execute("select count(rowid) from url_list").fetchone()[0] - words_count = self.con.execute("select count(rowid) from word_list").fetchone()[0] - max = 0 - for word_id in range(1, words_count + 1): - urls = self.con.execute("select distinct url_id from word_location " - "where word_id = %s" % word_id).fetchall() - num = len(urls) - if num > max: - max = num + words_urls = self.con.execute("select word_id, count(distinct url_id) from word_location " + "group by word_id").fetchall() + + for pair in words_urls: + word_id = pair[0] + num = pair[1] idf = math.log10(url_count / num) self.con.execute("update word_list set idf = %f where rowid = %d" % (idf, word_id)) self.db_commit() + def count_vectors_length(self): + '''Count vector's length for each url + Set this value to the table url_list''' + print "Counting lengths..." 
+ url_ids = self.con.execute("select rowid from url_list").fetchall() + url_ids = (url_id[0] for url_id in url_ids) + + for url_id in url_ids: + words_count = self.con.execute("select word_id, count(word_id) from word_location where " + " url_id = %d group by word_id" % url_id).fetchall() + words_dict = {pair[0]: pair[1] for pair in words_count} + length = 0 + for word in words_dict: + length = length + words_dict[word] * words_dict[word] + length = math.sqrt(length) + self.con.execute("update url_list set length = %f where rowid = %d" % (length, url_id)) + + self.db_commit() + + def create_index_tables(self): """ Create database tables """ res = self.con.execute('select name from sqlite_master where type="table" and name="url_list"').fetchone() @@ -244,7 +264,7 @@ def create_index_tables(self): self.db_commit() return - self.con.execute('create table url_list(url)') + self.con.execute('create table url_list(url, length)') self.con.execute('create table word_list(word, idf)') self.con.execute('create table word_location(url_id, word_id, location)') self.con.execute('create table link(from_id integer, to_id integer)') diff --git a/searcher.py b/searcher.py index 47d62c6..864ed7a 100644 --- a/searcher.py +++ b/searcher.py @@ -39,16 +39,27 @@ def get_top_words(self, words, n): words_top[word] += 1 sorted_top = sorted(words_top.iteritems(), key=operator.itemgetter(1), reverse=True) - #print sorted_top - if len(sorted_top) <= n: return sorted_top else: return sorted_top[0: n] + def get_top_n_from_dict(self, dic, n): + '''Return list of pairs (key, value) + sorted top n values from dictionary dic ''' + heap = [] + for word in dic: + if len(heap) < n: + heapq.heappush(heap, (dic[word], word)) + else: + heapq.heappushpop(heap, (dic[word], word)) + heap.sort(reverse=True) + inverted = [(pair[1], pair[0]) for pair in heap] + return inverted + def tf(self, words): - '''Return sorted words''' + '''Return words sorted by their tf''' words_top = {word: 0 for word in words} for word in words: words_top[word] += 1 @@ -66,12 +77,8 @@ def get_top_tf_idf(self, words, n): idf = self.con.execute("select idf from word_list where word = '%s'" % word).fetchone()[0] top_dict[word] = top_dict[word] * idf - sorted_top = sorted(top_dict.iteritems(), key=operator.itemgetter(1), reverse=True) - - if len(sorted_top) <= n: - return sorted_top - else: - return sorted_top[0: n] + sorted_top = self.get_top_n_from_dict(top_dict, n) + return sorted_top def get_url_by_id(self, url_id): @@ -79,36 +86,39 @@ def get_url_by_id(self, url_id): url = self.con.execute("select url from url_list where rowid = '%s'" % url_id).fetchone()[0] return url + def count_length(self, l): + '''Return length of the vector saved in list of pairs''' + words_dict = {pair[0]: pair[1] for pair in l} + length = 0 + for word in words_dict: + length = length + words_dict[word] * words_dict[word] + + length = math.sqrt(length) + return length + + def cos_distance(self, url_id, words_tf_idf, length): + '''Count cos distance between list of pairs (word, tf * idf) + and article with id url = url_id + length is a length of vector, it doesn't affect the order of documents, but it makes + cos values between 0 and 1''' - def cos_distance(self, url_id, words): - '''Count cos distance between list of words and article with id url = url_id''' - sum = 0 url_words = self.con.execute("select word from word_list join word_location on " "word_list.rowid = word_location.word_id where " "word_location.url_id = %s" % url_id).fetchall() url_words = [pair[0] for pair in 
url_words] url_words_counted = self.get_top_words(url_words, len(url_words)) - #list of pairs - words_dict = {pair[0]: pair[1] for pair in words} + words_dict = {pair[0]: pair[1] for pair in words_tf_idf} url_words_dict = {pair[0]: pair[1] for pair in url_words_counted} + sum = 0 for word in words_dict: if word in url_words_dict: sum = sum + words_dict[word] * url_words_dict[word] - length1 = 0 - for word in words_dict: - length1 = length1 + words_dict[word] * words_dict[word] - length1 = math.sqrt(length1) - - length2 = 0 - for word in url_words_dict: - length2 = length2 + url_words_dict[word] * url_words_dict[word] - length2 = math.sqrt(length2) - - cos = sum / (length1 * length2) + url_length = self.con.execute("select length from url_list where rowid = %d" % url_id).fetchone()[0] + cos = sum / (url_length * length) return cos @@ -118,25 +128,22 @@ def cos_search(self, text, n): top is counted by (number of repeats) * (idf) ''' all_words = self.separate_words(text) - words = self.get_top_tf_idf(all_words, n) + words_top_tf_idf = self.get_top_tf_idf(all_words, n) + print words_top_tf_idf + length = self.count_length(words_top_tf_idf) #list of pairs - url_count = self.con.execute("select count(rowid) from url_list").fetchone()[0] - print "Number of documents is %s" % url_count + url_ids = self.con.execute("select rowid from url_list").fetchall() + url_ids = [url_id[0] for url_id in url_ids] + url_count = len(url_ids) + print "Number of documents is %d" % url_count print "Searching..." - url_cos = {url_id: self.cos_distance(url_id, words) for url_id in range(1, url_count + 1)} - top_n = sorted(url_cos.iteritems(), key=operator.itemgetter(1), reverse=True) - top_n = top_n[0: self.SHOW_ANSWER] + url_cos = {url_id: self.cos_distance(url_id, words_top_tf_idf, length) for url_id in url_ids} + top_n = self.get_top_n_from_dict(url_cos, self.SHOW_ANSWER) print "Answer: " for url_id in top_n: - print "id = %4s cos = %s" % (url_id[0], url_id[1]) + print "id = %4d cos = %f" % (url_id[0], url_id[1]) print self.get_url_by_id(url_id[0]) - - - - - - From 0b245bcb9bc53d7e1d3f62df8a89b9554ab81b00 Mon Sep 17 00:00:00 2001 From: elizabeth Date: Thu, 26 Dec 2013 13:03:33 +0400 Subject: [PATCH 12/22] Both vectors consist of tf * idf --- search.py | 15 ++++++---- searcher.py | 85 ++++++++++++++++++++++++++++++---------------------- start_search | 2 +- 3 files changed, 61 insertions(+), 41 deletions(-) diff --git a/search.py b/search.py index fd39b84..70a54c2 100755 --- a/search.py +++ b/search.py @@ -233,20 +233,25 @@ def count_idf(self): def count_vectors_length(self): - '''Count vector's length for each url + '''Count vector's length for each url (Euclidean norm of tf * idf for each word in url) Set this value to the table url_list''' print "Counting lengths..." 
url_ids = self.con.execute("select rowid from url_list").fetchall() url_ids = (url_id[0] for url_id in url_ids) for url_id in url_ids: - words_count = self.con.execute("select word_id, count(word_id) from word_location where " - " url_id = %d group by word_id" % url_id).fetchall() - words_dict = {pair[0]: pair[1] for pair in words_count} + words_count = self.con.execute("select word_id, count(word_id), idf from word_location join " + " word_list on word_location.word_id = word_list.rowid where " + " url_id = %d group by word_id" % url_id).fetchall() + words_dict = {record[0]: record[1] for record in words_count} + words_idf = {record[0]: record[2] for record in words_count} + sum_of_words = sum(words_dict[word] for word in words_dict) + length = 0 for word in words_dict: - length = length + words_dict[word] * words_dict[word] + length += pow(words_dict[word] * words_idf[word], 2) length = math.sqrt(length) + length = length / sum_of_words self.con.execute("update url_list set length = %f where rowid = %d" % (length, url_id)) self.db_commit() diff --git a/searcher.py b/searcher.py index 864ed7a..94e5419 100644 --- a/searcher.py +++ b/searcher.py @@ -28,12 +28,15 @@ def __del__(self): def db_commit(self): self.con.commit() - def separate_words(self, text): + @staticmethod + def separate_words(text): splitter = re.compile('\\W*') return [s.lower() for s in splitter.split(text) if s != ''] - def get_top_words(self, words, n): - '''Return top n words in text''' + @staticmethod + def get_top_words(words, n): + '''Return top n words in text + Return list of pairs (word, num of repetition)''' words_top = {word: 0 for word in words} for word in words: words_top[word] += 1 @@ -44,7 +47,8 @@ def get_top_words(self, words, n): else: return sorted_top[0: n] - def get_top_n_from_dict(self, dic, n): + @staticmethod + def get_top_n_from_dict(dic, n): '''Return list of pairs (key, value) sorted top n values from dictionary dic ''' heap = [] @@ -68,16 +72,23 @@ def tf(self, words): sorted_top = sorted(words_top.iteritems(), key=operator.itemgetter(1), reverse=True) return sorted_top + def idf(self, word): + '''Return idf of word''' + idf = self.con.execute("select idf from word_list where word = '%s'" % word).fetchone() + if idf is None: + return 0 + else: + return idf[0] def get_top_tf_idf(self, words, n): '''Get top n words from list words using tf * idf''' tf_list = self.tf(words) top_dict = {pair[0]: pair[1] for pair in tf_list} for word in top_dict: - idf = self.con.execute("select idf from word_list where word = '%s'" % word).fetchone()[0] + idf = self.idf(word) top_dict[word] = top_dict[word] * idf - sorted_top = self.get_top_n_from_dict(top_dict, n) + sorted_top = Searcher.get_top_n_from_dict(top_dict, n) return sorted_top @@ -86,8 +97,9 @@ def get_url_by_id(self, url_id): url = self.con.execute("select url from url_list where rowid = '%s'" % url_id).fetchone()[0] return url - def count_length(self, l): - '''Return length of the vector saved in list of pairs''' + @staticmethod + def count_length(l): + '''Return Euclidean norm of the vector, saved in list of pairs''' words_dict = {pair[0]: pair[1] for pair in l} length = 0 for word in words_dict: @@ -96,51 +108,54 @@ def count_length(self, l): length = math.sqrt(length) return length - def cos_distance(self, url_id, words_tf_idf, length): - '''Count cos distance between list of pairs (word, tf * idf) - and article with id url = url_id - length is a length of vector, it doesn't affect the order of documents, but it makes - cos values between 0 and 
1''' - - url_words = self.con.execute("select word from word_list join word_location on " - "word_list.rowid = word_location.word_id where " - "word_location.url_id = %s" % url_id).fetchall() - - url_words = [pair[0] for pair in url_words] - url_words_counted = self.get_top_words(url_words, len(url_words)) + @staticmethod + def cos_distance(url_words_tf_idf, url_words_length, words_tf_idf, words_length): + ''' Count cos distance between two vectors of words + url_words_tf_idf - list of pairs (word, tf * idf), where words are words from url + url_words_length - norm of vector url_words_tf_idf + words_tf_idf - list of pairs (word, tf * idf), where words are words from text + words_length - norm of vector words_tf_idf ''' words_dict = {pair[0]: pair[1] for pair in words_tf_idf} - url_words_dict = {pair[0]: pair[1] for pair in url_words_counted} + url_words_dict = {pair[0]: pair[1] for pair in url_words_tf_idf} - sum = 0 + sc_product = 0 for word in words_dict: if word in url_words_dict: - sum = sum + words_dict[word] * url_words_dict[word] + sc_product += words_dict[word] * url_words_dict[word] - url_length = self.con.execute("select length from url_list where rowid = %d" % url_id).fetchone()[0] - cos = sum / (url_length * length) - return cos + return sc_product / (url_words_length * words_length) def cos_search(self, text, n): - ''' Start search by cos distance between text and documents + ''' Start search by cos distance between text and documents(urls) n means taking n top words from the text - top is counted by (number of repeats) * (idf) ''' + top is counted by (tf * idf) ''' - all_words = self.separate_words(text) - words_top_tf_idf = self.get_top_tf_idf(all_words, n) - print words_top_tf_idf - length = self.count_length(words_top_tf_idf) - #list of pairs + text_words = Searcher.separate_words(text) + text_words_tf_idf = self.get_top_tf_idf(text_words, n) + text_length = Searcher.count_length(text_words_tf_idf) url_ids = self.con.execute("select rowid from url_list").fetchall() url_ids = [url_id[0] for url_id in url_ids] + url_count = len(url_ids) print "Number of documents is %d" % url_count print "Searching..." - url_cos = {url_id: self.cos_distance(url_id, words_top_tf_idf, length) for url_id in url_ids} - top_n = self.get_top_n_from_dict(url_cos, self.SHOW_ANSWER) + url_cos = {} + for url_id in url_ids: + url_words = self.con.execute("select word from word_list join word_location on " + " word_list.rowid = word_location.word_id where " + " word_location.url_id = %s" % url_id).fetchall() + + url_words = [pair[0] for pair in url_words] + url_words_tf_idf = self.get_top_tf_idf(url_words, len(url_words)) + url_length = self.con.execute("select length from url_list where rowid = %d" % url_id).fetchone()[0] + + url_cos[url_id] = Searcher.cos_distance(url_words_tf_idf, url_length, text_words_tf_idf, text_length) + + top_n = Searcher.get_top_n_from_dict(url_cos, self.SHOW_ANSWER) print "Answer: " for url_id in top_n: diff --git a/start_search b/start_search index 69f89d3..a31ae2a 100755 --- a/start_search +++ b/start_search @@ -2,6 +2,6 @@ import searcher s = searcher.Searcher('db_acm') -s.cos_search("Quality of service (QoS) can be a critical element for achieving the business goals of a service provider, for the acceptance of a service by the user, or for guaranteeing service characteristics in a composition of services, where a service is defined as either a software or a software-support (i.e., infrastructural) service which is available on any type of network or electronic channel. 
The goal of this article is to compare the approaches to QoS description in the literature, where several models and metamodels are included. consider a large spectrum of models and metamodels to describe service quality, ranging from ontological approaches to define quality measures, metrics, and dimensions, to metamodels enabling the specification of quality-based service requirements and capabilities as well as of SLAs (Service-Level Agreements) and SLA templates for service provisioning.", 10) +s.cos_search("Quality of service merry christmas a lot of strange words service", 10) From f709f4aacf40a8ec8114216fbb09891eaae9506b Mon Sep 17 00:00:00 2001 From: elizabeth Date: Thu, 26 Dec 2013 13:14:31 +0400 Subject: [PATCH 13/22] Small correction --- searcher.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/searcher.py b/searcher.py index 94e5419..d83c227 100644 --- a/searcher.py +++ b/searcher.py @@ -124,6 +124,9 @@ def cos_distance(url_words_tf_idf, url_words_length, words_tf_idf, words_length) if word in url_words_dict: sc_product += words_dict[word] * url_words_dict[word] + if url_words_length == 0 or words_length == 0: + return 0 + return sc_product / (url_words_length * words_length) From 364f485cbed45800818293141019de69d5e46b89 Mon Sep 17 00:00:00 2001 From: elizabeth Date: Sat, 28 Dec 2013 23:30:49 +0400 Subject: [PATCH 14/22] Corrections again --- search.py | 6 ++++++ searcher.py | 56 +++++++++++++++++++++++++---------------------------- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/search.py b/search.py index 70a54c2..73fa36e 100755 --- a/search.py +++ b/search.py @@ -193,6 +193,9 @@ def crawl(self, journal_url, depth=2): count = 1 for link in links: + #DEBUG + if count > 5: + break ref = self.delete_user_info(link['href']) print "==============" print " Journal link: " + self.BASE + ref @@ -201,6 +204,9 @@ def crawl(self, journal_url, depth=2): list_of_papers = self.get_list_of_links(self.BASE + list_vol) for paper in list_of_papers: + #DEBUG + if count > 5: + break paper_ref = self.delete_user_info(paper['href']) if (len(dict(paper.attrs)) == 1) and (paper_ref.startswith(self.IS_PAPER_LINK)): paper_abstract = self.open_tab(self.BASE + paper_ref, self.ABSTRACT_TAB_NAME) diff --git a/searcher.py b/searcher.py index d83c227..37001ca 100644 --- a/searcher.py +++ b/searcher.py @@ -47,30 +47,15 @@ def get_top_words(words, n): else: return sorted_top[0: n] - @staticmethod - def get_top_n_from_dict(dic, n): - '''Return list of pairs (key, value) - sorted top n values from dictionary dic ''' - heap = [] - for word in dic: - if len(heap) < n: - heapq.heappush(heap, (dic[word], word)) - else: - heapq.heappushpop(heap, (dic[word], word)) - heap.sort(reverse=True) - inverted = [(pair[1], pair[0]) for pair in heap] - return inverted - def tf(self, words): - '''Return words sorted by their tf''' + '''Return tf of words''' words_top = {word: 0 for word in words} for word in words: words_top[word] += 1 - words_top = {word: words_top[word] / float(len(words)) for word in words_top} + words_freq = {word: words_top[word] / float(len(words)) for word in words_top} - sorted_top = sorted(words_top.iteritems(), key=operator.itemgetter(1), reverse=True) - return sorted_top + return words_freq def idf(self, word): '''Return idf of word''' @@ -82,14 +67,20 @@ def idf(self, word): def get_top_tf_idf(self, words, n): '''Get top n words from list words using tf * idf''' - tf_list = self.tf(words) - top_dict = {pair[0]: pair[1] for pair in tf_list} - for word in top_dict: + tf_dict = 
self.tf(words) + heap = [] + + for word in tf_dict: idf = self.idf(word) - top_dict[word] = top_dict[word] * idf + tf_idf = tf_dict[word] * idf + if len(heap) < n: + heapq.heappush(heap, (tf_idf, word)) + else: + heapq.heappushpop(heap, (tf_idf, word)) - sorted_top = Searcher.get_top_n_from_dict(top_dict, n) - return sorted_top + heap.sort(reverse=True) + inverted = [(pair[1], pair[0]) for pair in heap] + return inverted def get_url_by_id(self, url_id): @@ -116,6 +107,9 @@ def cos_distance(url_words_tf_idf, url_words_length, words_tf_idf, words_length) words_tf_idf - list of pairs (word, tf * idf), where words are words from text words_length - norm of vector words_tf_idf ''' + if url_words_length == 0 or words_length == 0: + return 0 + words_dict = {pair[0]: pair[1] for pair in words_tf_idf} url_words_dict = {pair[0]: pair[1] for pair in url_words_tf_idf} @@ -124,9 +118,6 @@ def cos_distance(url_words_tf_idf, url_words_length, words_tf_idf, words_length) if word in url_words_dict: sc_product += words_dict[word] * url_words_dict[word] - if url_words_length == 0 or words_length == 0: - return 0 - return sc_product / (url_words_length * words_length) @@ -146,7 +137,7 @@ def cos_search(self, text, n): print "Number of documents is %d" % url_count print "Searching..." - url_cos = {} + heap = [] for url_id in url_ids: url_words = self.con.execute("select word from word_list join word_location on " " word_list.rowid = word_location.word_id where " @@ -155,10 +146,15 @@ def cos_search(self, text, n): url_words = [pair[0] for pair in url_words] url_words_tf_idf = self.get_top_tf_idf(url_words, len(url_words)) url_length = self.con.execute("select length from url_list where rowid = %d" % url_id).fetchone()[0] + url_cos = Searcher.cos_distance(url_words_tf_idf, url_length, text_words_tf_idf, text_length) - url_cos[url_id] = Searcher.cos_distance(url_words_tf_idf, url_length, text_words_tf_idf, text_length) + if len(heap) < self.SHOW_ANSWER: + heapq.heappush(heap, (url_cos, url_id)) + else: + heapq.heappushpop(heap, (url_cos, url_id)) - top_n = Searcher.get_top_n_from_dict(url_cos, self.SHOW_ANSWER) + heap.sort(reverse=True) + top_n = [(pair[1], pair[0]) for pair in heap] print "Answer: " for url_id in top_n: From f61daa5a01e9d57510efd5941010cbf863b6acc7 Mon Sep 17 00:00:00 2001 From: elizabeth Date: Sun, 29 Dec 2013 00:30:11 +0400 Subject: [PATCH 15/22] sql query correction --- search.py | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/search.py b/search.py index 73fa36e..8aa157c 100755 --- a/search.py +++ b/search.py @@ -194,8 +194,8 @@ def crawl(self, journal_url, depth=2): count = 1 for link in links: #DEBUG - if count > 5: - break + #if count > 5: + # break ref = self.delete_user_info(link['href']) print "==============" print " Journal link: " + self.BASE + ref @@ -205,8 +205,8 @@ def crawl(self, journal_url, depth=2): for paper in list_of_papers: #DEBUG - if count > 5: - break + #if count > 5: + # break paper_ref = self.delete_user_info(paper['href']) if (len(dict(paper.attrs)) == 1) and (paper_ref.startswith(self.IS_PAPER_LINK)): paper_abstract = self.open_tab(self.BASE + paper_ref, self.ABSTRACT_TAB_NAME) @@ -242,23 +242,18 @@ def count_vectors_length(self): '''Count vector's length for each url (Euclidean norm of tf * idf for each word in url) Set this value to the table url_list''' print "Counting lengths..." 
- url_ids = self.con.execute("select rowid from url_list").fetchall() - url_ids = (url_id[0] for url_id in url_ids) - - for url_id in url_ids: - words_count = self.con.execute("select word_id, count(word_id), idf from word_location join " - " word_list on word_location.word_id = word_list.rowid where " - " url_id = %d group by word_id" % url_id).fetchall() - words_dict = {record[0]: record[1] for record in words_count} - words_idf = {record[0]: record[2] for record in words_count} - sum_of_words = sum(words_dict[word] for word in words_dict) - - length = 0 - for word in words_dict: - length += pow(words_dict[word] * words_idf[word], 2) - length = math.sqrt(length) - length = length / sum_of_words - self.con.execute("update url_list set length = %f where rowid = %d" % (length, url_id)) + + url_count = self.con.execute("select url_id, sum(wcount), sum(count_idf) from " + "(select url_id, count(location) as wcount, " + "count(location) * count(location) * word_list.idf * word_list.idf as count_idf " + "from word_location join word_list on word_location.word_id=word_list.rowid " + "group by url_id, word_id) T1 " + "group by T1.url_id").fetchall() + + for url_record in url_count: + length = math.sqrt(url_record[2]) + length = length / url_record[1] + self.con.execute("update url_list set length = %f where rowid = %d" % (length, url_record[0])) self.db_commit() From 7b4082773bddcdb8ddfbe16ff41e55dec7e79676 Mon Sep 17 00:00:00 2001 From: elizabeth Date: Thu, 23 Jan 2014 23:26:02 +0400 Subject: [PATCH 16/22] db updating and authors are added --- crawl_acm | 2 +- search.py | 59 ++++++++++++++++++++++++++++++++++++---------------- searcher.py | 9 +++++--- start_search | 2 +- 4 files changed, 49 insertions(+), 23 deletions(-) diff --git a/crawl_acm b/crawl_acm index 5756822..b4907cb 100755 --- a/crawl_acm +++ b/crawl_acm @@ -3,7 +3,7 @@ import search c = search.Crawler('db_acm') c.create_index_tables() -c.crawl("http://dl.acm.org/pub.cfm?id=J204&CFID=261281623&CFTOKEN=70296603") +c.crawl("http://dl.acm.org/pub.cfm?id=J204") c.count_idf() c.count_vectors_length() diff --git a/search.py b/search.py index 8aa157c..7c769a5 100755 --- a/search.py +++ b/search.py @@ -60,6 +60,19 @@ def get_entry_id(self, table, field, value, create_new=True): else: return res[0] + def get_entry_id_url_list(self, url, title, authors, create_new=True): + """ Return id of row in table if this row exists + Else create this row and return id""" + cur = self.con.execute( + "select rowid from url_list where url = '%s'" % url) + res = cur.fetchone() + if res is None: + cur = self.con.execute( + "insert into url_list (url, title, authors) values ('%s', '%s', '%s')" % (url, title, authors)) + return cur.lastrowid + else: + return res[0] + def get_text_only(self, soup): """ Return text from Soup of page""" v = soup.string @@ -78,7 +91,7 @@ def separate_words(self, text): splitter = re.compile('\\W*') return [s.lower() for s in splitter.split(text) if s != ''] - def add_to_index(self, url, text, title, count): + def add_to_index(self, url, text, title, authors, count): """ Add all words from text (from url) to database. 
This url becomes indexed """ if self.is_indexed(url): @@ -91,7 +104,6 @@ def add_to_index(self, url, text, title, count): words = [] if title is not None: - #print title words = self.separate_words(title) if (len(text) < 50) and (self.ABS_NOT_AVAILABLE in text): @@ -101,7 +113,7 @@ def add_to_index(self, url, text, title, count): for word in words_from_abstract: words.append(word) - url_id = self.get_entry_id('url_list', 'url', url) + url_id = self.get_entry_id_url_list(url, title, authors) for i in range(len(words)): word = words[i] @@ -159,8 +171,13 @@ def get_title(self, url): print "I can't get title of: %s" % url return soup = BeautifulSoup(con.read()) - title = soup.title.string - return title + + authors = soup.findAll(attrs={"name":"citation_authors"}) + #print "Authors: %s" % authors[0]['content'] + title = soup.findAll(attrs={"name":"citation_title"}) + #print "Title: %s" % title[0]['content'] + return title[0]['content'], authors[0]['content'] + def get_abstract_text(self, url): """ Return text of article's abstract""" @@ -194,8 +211,11 @@ def crawl(self, journal_url, depth=2): count = 1 for link in links: #DEBUG - #if count > 5: - # break + if count > 10: + break + if not (link['href'].startswith("citation")): + continue + ref = self.delete_user_info(link['href']) print "==============" print " Journal link: " + self.BASE + ref @@ -205,17 +225,26 @@ def crawl(self, journal_url, depth=2): for paper in list_of_papers: #DEBUG - #if count > 5: - # break + if count > 10: + break paper_ref = self.delete_user_info(paper['href']) + if (len(dict(paper.attrs)) == 1) and (paper_ref.startswith(self.IS_PAPER_LINK)): + ref = self.BASE + paper_ref + is_already_indexed = self.con.execute("select rowid from url_list where url = '%s'" % + ref).fetchone() + if is_already_indexed is not None: + print "%4d %s is already indexed" % (count, ref) + count += 1 + continue + paper_abstract = self.open_tab(self.BASE + paper_ref, self.ABSTRACT_TAB_NAME) if paper_abstract is None: continue text = self.get_abstract_text(self.BASE + paper_abstract) - title = self.get_title(self.BASE + paper_ref) + meta = self.get_title(self.BASE + paper_ref) - self.add_to_index(self.BASE + paper_ref, text, title, count) + self.add_to_index(self.BASE + paper_ref, text, meta[0], meta[1], count) count += 1 self.db_commit() @@ -262,15 +291,9 @@ def create_index_tables(self): """ Create database tables """ res = self.con.execute('select name from sqlite_master where type="table" and name="url_list"').fetchone() if res is not None: - self.con.execute('delete from url_list') - self.con.execute('delete from word_list') - self.con.execute('delete from word_location') - self.con.execute('delete from link') - self.con.execute('delete from link_words') - self.db_commit() return - self.con.execute('create table url_list(url, length)') + self.con.execute('create table url_list(url, length, title, authors)') self.con.execute('create table word_list(word, idf)') self.con.execute('create table word_location(url_id, word_id, location)') self.con.execute('create table link(from_id integer, to_id integer)') diff --git a/searcher.py b/searcher.py index 37001ca..d8488d3 100644 --- a/searcher.py +++ b/searcher.py @@ -85,7 +85,7 @@ def get_top_tf_idf(self, words, n): def get_url_by_id(self, url_id): '''Return url by its id''' - url = self.con.execute("select url from url_list where rowid = '%s'" % url_id).fetchone()[0] + url = self.con.execute("select url, title, authors from url_list where rowid = '%s'" % url_id).fetchall()[0] return url 
@staticmethod @@ -158,6 +158,9 @@ def cos_search(self, text, n): print "Answer: " for url_id in top_n: - print "id = %4d cos = %f" % (url_id[0], url_id[1]) - print self.get_url_by_id(url_id[0]) + if url_id[1] > 0: + print "id = %4d cos = %f" % (url_id[0], url_id[1]) + print self.get_url_by_id(url_id[0])[0] + print self.get_url_by_id(url_id[0])[1] + print self.get_url_by_id(url_id[0])[2] diff --git a/start_search b/start_search index a31ae2a..5ecee9c 100755 --- a/start_search +++ b/start_search @@ -2,6 +2,6 @@ import searcher s = searcher.Searcher('db_acm') -s.cos_search("Quality of service merry christmas a lot of strange words service", 10) +s.cos_search("Quality of service service service service service ", 10) From b9506f0c7cee5f9029c6fbb479eb467ffba8d6b0 Mon Sep 17 00:00:00 2001 From: elizabeth Date: Fri, 24 Jan 2014 01:02:52 +0400 Subject: [PATCH 17/22] Journals and issues are added --- crawl_acm | 2 +- search.py | 45 ++++++++++++++++++++++++++++++++++----------- searcher.py | 17 ++++++++++++----- 3 files changed, 47 insertions(+), 17 deletions(-) diff --git a/crawl_acm b/crawl_acm index b4907cb..3e1fb71 100755 --- a/crawl_acm +++ b/crawl_acm @@ -3,7 +3,7 @@ import search c = search.Crawler('db_acm') c.create_index_tables() -c.crawl("http://dl.acm.org/pub.cfm?id=J204") +c.crawl("http://dl.acm.org/pub.cfm?id=J204", "ACM Computing Surveys (CSUR)") c.count_idf() c.count_vectors_length() diff --git a/search.py b/search.py index 7c769a5..4b3dd94 100755 --- a/search.py +++ b/search.py @@ -60,15 +60,29 @@ def get_entry_id(self, table, field, value, create_new=True): else: return res[0] - def get_entry_id_url_list(self, url, title, authors, create_new=True): + def get_entry_id_url_list(self, url, title, authors, issue_id, create_new=True): """ Return id of row in table if this row exists - Else create this row and return id""" + Else create this row and return id for url""" cur = self.con.execute( "select rowid from url_list where url = '%s'" % url) res = cur.fetchone() if res is None: cur = self.con.execute( - "insert into url_list (url, title, authors) values ('%s', '%s', '%s')" % (url, title, authors)) + "insert into url_list (url, title, authors, issue_id) values ('%s', '%s', '%s', '%s')" + % (url, title, authors, issue_id)) + return cur.lastrowid + else: + return res[0] + + def get_entry_id_issue(self, url, name, jour_id, create_new=True): + """ Return id of row in table if this row exists + Else create this row and return id for issue""" + cur = self.con.execute( + "select rowid from issue where url = '%s'" % url) + res = cur.fetchone() + if res is None: + cur = self.con.execute( + "insert into issue (jour_id, name, url) values ('%s', '%s', '%s')" % (jour_id, name, url)) return cur.lastrowid else: return res[0] @@ -91,7 +105,7 @@ def separate_words(self, text): splitter = re.compile('\\W*') return [s.lower() for s in splitter.split(text) if s != ''] - def add_to_index(self, url, text, title, authors, count): + def add_to_index(self, url, text, title, authors, count, issue_id): """ Add all words from text (from url) to database. 
This url becomes indexed """ if self.is_indexed(url): @@ -113,7 +127,7 @@ def add_to_index(self, url, text, title, authors, count): for word in words_from_abstract: words.append(word) - url_id = self.get_entry_id_url_list(url, title, authors) + url_id = self.get_entry_id_url_list(url, title, authors, issue_id) for i in range(len(words)): word = words[i] @@ -197,9 +211,12 @@ def delete_user_info(self, url): new_url = url[0: ind] return new_url - def crawl(self, journal_url, depth=2): + def crawl(self, journal_url, name, depth=2): """ Begin crawling journal in ACM Library """ + print " Journal link: " + journal_url + journal_id = self.get_entry_id('journal', 'name', name) + link = self.open_tab(journal_url, self.ARCHIVE_TAB_NAME) if link is None: return @@ -210,22 +227,26 @@ def crawl(self, journal_url, depth=2): count = 1 for link in links: + info = link.string + #DEBUG - if count > 10: + if count > 20: break if not (link['href'].startswith("citation")): continue ref = self.delete_user_info(link['href']) + issue_id = self.get_entry_id_issue(self.BASE + ref, info, journal_id) + print "==============" - print " Journal link: " + self.BASE + ref + print " Issue link: " + self.BASE + ref print "==============" list_vol = self.open_tab(self.BASE + ref, self.TABLE_OF_CONTENTS_TAB_NAME) list_of_papers = self.get_list_of_links(self.BASE + list_vol) for paper in list_of_papers: #DEBUG - if count > 10: + if count > 20: break paper_ref = self.delete_user_info(paper['href']) @@ -244,7 +265,7 @@ def crawl(self, journal_url, depth=2): text = self.get_abstract_text(self.BASE + paper_abstract) meta = self.get_title(self.BASE + paper_ref) - self.add_to_index(self.BASE + paper_ref, text, meta[0], meta[1], count) + self.add_to_index(self.BASE + paper_ref, text, meta[0], meta[1], count, issue_id) count += 1 self.db_commit() @@ -293,7 +314,9 @@ def create_index_tables(self): if res is not None: return - self.con.execute('create table url_list(url, length, title, authors)') + self.con.execute('create table url_list(url, length, title, authors, issue_id)') + self.con.execute('create table issue(jour_id, name, url)') + self.con.execute('create table journal(name)') self.con.execute('create table word_list(word, idf)') self.con.execute('create table word_location(url_id, word_id, location)') self.con.execute('create table link(from_id integer, to_id integer)') diff --git a/searcher.py b/searcher.py index d8488d3..6c31190 100644 --- a/searcher.py +++ b/searcher.py @@ -85,7 +85,10 @@ def get_top_tf_idf(self, words, n): def get_url_by_id(self, url_id): '''Return url by its id''' - url = self.con.execute("select url, title, authors from url_list where rowid = '%s'" % url_id).fetchall()[0] + url = self.con.execute("select url_list.url, title, authors, journal.name, issue.name from url_list " + "join issue on url_list.issue_id = issue.rowid " + "join journal on issue.jour_id = journal.rowid " + "where url_list.rowid = '%s'" % url_id).fetchall()[0] return url @staticmethod @@ -157,10 +160,14 @@ def cos_search(self, text, n): top_n = [(pair[1], pair[0]) for pair in heap] print "Answer: " + number = 1 for url_id in top_n: if url_id[1] > 0: - print "id = %4d cos = %f" % (url_id[0], url_id[1]) - print self.get_url_by_id(url_id[0])[0] - print self.get_url_by_id(url_id[0])[1] - print self.get_url_by_id(url_id[0])[2] + print "%2d. 
id = %4d cos = %f" % (number, url_id[0], url_id[1]) + number += 1 + print " " + self.get_url_by_id(url_id[0])[0] + print " " + self.get_url_by_id(url_id[0])[1] + print " " + self.get_url_by_id(url_id[0])[2] + print " " + self.get_url_by_id(url_id[0])[3] + print " " + self.get_url_by_id(url_id[0])[4] From c6f764a428905658f9eb08c83fb6dcb919054d78 Mon Sep 17 00:00:00 2001 From: elizabeth Date: Fri, 24 Jan 2014 15:19:43 +0400 Subject: [PATCH 18/22] Some bugs fixed --- search.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/search.py b/search.py index 4b3dd94..650347c 100755 --- a/search.py +++ b/search.py @@ -187,10 +187,18 @@ def get_title(self, url): soup = BeautifulSoup(con.read()) authors = soup.findAll(attrs={"name":"citation_authors"}) + if len(authors) > 0: + authors = authors[0]['content'] + else: + authors = "Authors are unknown" #print "Authors: %s" % authors[0]['content'] title = soup.findAll(attrs={"name":"citation_title"}) + if len(title) > 0: + title = title[0]['content'] + else: + title = "Title is unknown" #print "Title: %s" % title[0]['content'] - return title[0]['content'], authors[0]['content'] + return title, authors def get_abstract_text(self, url): @@ -230,8 +238,8 @@ def crawl(self, journal_url, name, depth=2): info = link.string #DEBUG - if count > 20: - break + #if count > 20: + # break if not (link['href'].startswith("citation")): continue @@ -246,8 +254,8 @@ def crawl(self, journal_url, name, depth=2): for paper in list_of_papers: #DEBUG - if count > 20: - break + #if count > 20: + # break paper_ref = self.delete_user_info(paper['href']) if (len(dict(paper.attrs)) == 1) and (paper_ref.startswith(self.IS_PAPER_LINK)): From 962821718a36dc36c9c95fe9aa98012aed1d3710 Mon Sep 17 00:00:00 2001 From: elizabeth Date: Sat, 8 Feb 2014 13:39:10 +0400 Subject: [PATCH 19/22] Cutting bad documents --- searcher.py | 74 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 64 insertions(+), 10 deletions(-) diff --git a/searcher.py b/searcher.py index 6c31190..381bd44 100644 --- a/searcher.py +++ b/searcher.py @@ -18,6 +18,9 @@ class Searcher: SHOW_ANSWER = 10 + MAGIC_NUMBER = 20 + IDF_BOUND = 1 + COS_BOUND = 0.1 def __init__(self, db_name): self.con = sqlite.connect(db_name) @@ -33,19 +36,61 @@ def separate_words(text): splitter = re.compile('\\W*') return [s.lower() for s in splitter.split(text) if s != ''] - @staticmethod - def get_top_words(words, n): - '''Return top n words in text - Return list of pairs (word, num of repetition)''' + + def get_top_words(self, words, n): + '''Return top n tf * idf words in text + Return list of words ''' words_top = {word: 0 for word in words} for word in words: words_top[word] += 1 + for word in words_top: + word_idf = self.con.execute( + "select idf from word_list where word = '%s'" % word).fetchone() + if word_idf is None: + words_top[word] = 0 + else: + word_idf = word_idf[0] + if word_idf > self.IDF_BOUND: + words_top[word] = words_top[word] * word_idf + else: + words_top[word] = 0 + + words_top = {word: words_top[word] for word in words_top if words_top[word] > 0} + sorted_top = sorted(words_top.iteritems(), key=operator.itemgetter(1), reverse=True) - if len(sorted_top) <= n: - return sorted_top + clear_list = [pair[0] for pair in sorted_top] + if len(clear_list) <= n: + return clear_list else: - return sorted_top[0: n] + return clear_list[0: n] + + + def find_rows(self, words): + '''Find documents which contain one of words''' + if len(words) == 0: + return [] + + word_id_list = [] + 
table_num = 0 + clause_list = '' + + for word in words: + word_row = self.con.execute( + "select rowid from word_list where word = '%s'" % word).fetchone() + if word_row is not None: + word_id = word_row[0] + #print "word_id: %d" % word_id + word_id_list.append(word_id) + if table_num > 0: + clause_list += ' or ' + clause_list += 'word_id = %d' % word_id + table_num += 1 + + query = 'select distinct url_id from word_location where %s ' % clause_list + result = self.con.execute(query) + rows = [row for row in result] + return rows def tf(self, words): @@ -133,15 +178,21 @@ def cos_search(self, text, n): text_words_tf_idf = self.get_top_tf_idf(text_words, n) text_length = Searcher.count_length(text_words_tf_idf) - url_ids = self.con.execute("select rowid from url_list").fetchall() - url_ids = [url_id[0] for url_id in url_ids] + top_text_words = self.get_top_words(text_words, self.MAGIC_NUMBER) + url_ids = self.find_rows(top_text_words) + url_ids = [url_id[0] for url_id in url_ids] url_count = len(url_ids) - print "Number of documents is %d" % url_count + + url_full_count = self.con.execute("select count(rowid) from url_list").fetchone()[0] + print "Number of documents: %d " % url_full_count + + print "Number of documents after cutting: %d " % url_count print "Searching..." heap = [] for url_id in url_ids: + #print url_id url_words = self.con.execute("select word from word_list join word_location on " " word_list.rowid = word_location.word_id where " " word_location.url_id = %s" % url_id).fetchall() @@ -151,6 +202,9 @@ def cos_search(self, text, n): url_length = self.con.execute("select length from url_list where rowid = %d" % url_id).fetchone()[0] url_cos = Searcher.cos_distance(url_words_tf_idf, url_length, text_words_tf_idf, text_length) + if url_cos < self.COS_BOUND: + continue + if len(heap) < self.SHOW_ANSWER: heapq.heappush(heap, (url_cos, url_id)) else: From cffce249f1d57f6a7ba90cbfa2c3ceec9221b55a Mon Sep 17 00:00:00 2001 From: Dmitry Barashev Date: Mon, 10 Feb 2014 14:53:03 +0400 Subject: [PATCH 20/22] return json formatted output --- searcher.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/searcher.py b/searcher.py index 381bd44..65067bd 100644 --- a/searcher.py +++ b/searcher.py @@ -15,6 +15,7 @@ import operator import math import heapq +import sys class Searcher: SHOW_ANSWER = 10 @@ -185,10 +186,10 @@ def cos_search(self, text, n): url_count = len(url_ids) url_full_count = self.con.execute("select count(rowid) from url_list").fetchone()[0] - print "Number of documents: %d " % url_full_count + print >> sys.stderr, "Number of documents: %d " % url_full_count - print "Number of documents after cutting: %d " % url_count - print "Searching..." + print >> sys.stderr, "Number of documents after cutting: %d " % url_count + print >> sys.stderr, "Searching..." heap = [] for url_id in url_ids: @@ -213,15 +214,21 @@ def cos_search(self, text, n): heap.sort(reverse=True) top_n = [(pair[1], pair[0]) for pair in heap] - print "Answer: " + print '{"articles": [' number = 1 for url_id in top_n: if url_id[1] > 0: - print "%2d. 
id = %4d cos = %f" % (number, url_id[0], url_id[1]) + if number > 1: + print "," + article_data = self.get_url_by_id(url_id[0]) + print "{" + print '"docid": %4d,' % url_id[0] + print '"rank": %f, ' % url_id[1] + print '"url": "%s", ' % article_data[0] + print '"title": "%s", ' % article_data[1] + print '"authors": "%s", ' % article_data[2].encode('utf-8') + print '"journal": "%s", ' % article_data[3] + print '"issue": "%s" ' % article_data[4] + print "}" number += 1 - print " " + self.get_url_by_id(url_id[0])[0] - print " " + self.get_url_by_id(url_id[0])[1] - print " " + self.get_url_by_id(url_id[0])[2] - print " " + self.get_url_by_id(url_id[0])[3] - print " " + self.get_url_by_id(url_id[0])[4] - + print "]}" From e0b690c472b9392c98efd74b3921dd639321bc81 Mon Sep 17 00:00:00 2001 From: elizabeth Date: Tue, 11 Feb 2014 00:13:47 +0400 Subject: [PATCH 21/22] count cos distance in sql query --- searcher.py | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/searcher.py b/searcher.py index 65067bd..d4384b3 100644 --- a/searcher.py +++ b/searcher.py @@ -67,14 +67,15 @@ def get_top_words(self, words, n): return clear_list[0: n] - def find_rows(self, words): - '''Find documents which contain one of words''' + def find_rows(self, words, words_tfidf): + '''Find documents which contain one of words and count cos distance''' if len(words) == 0: return [] word_id_list = [] table_num = 0 - clause_list = '' + #clause_list = '' + query = 'select url_id, sum(tfidf * word_count) / length from (' for word in words: word_row = self.con.execute( @@ -84,13 +85,18 @@ def find_rows(self, words): #print "word_id: %d" % word_id word_id_list.append(word_id) if table_num > 0: - clause_list += ' or ' - clause_list += 'word_id = %d' % word_id + query += ' union ' + query += 'select url_id, word_id, %f AS tfidf, count(location) AS word_count , length ' \ + 'FROM word_location WL JOIN url_list UL ON (WL.url_id = UL.rowid and word_id = %d) ' \ + 'GROUP BY url_id, word_id, length' % (words_tfidf[word], word_id) table_num += 1 - query = 'select distinct url_id from word_location where %s ' % clause_list + query += ') group by url_id ' +# print query result = self.con.execute(query) rows = [row for row in result] + +# print rows return rows @@ -180,10 +186,10 @@ def cos_search(self, text, n): text_length = Searcher.count_length(text_words_tf_idf) top_text_words = self.get_top_words(text_words, self.MAGIC_NUMBER) - url_ids = self.find_rows(top_text_words) + url_ids_cos = self.find_rows(top_text_words, {word: idf for (word, idf) in text_words_tf_idf}) - url_ids = [url_id[0] for url_id in url_ids] - url_count = len(url_ids) + #url_ids = [url_id[0] for url_id in url_ids] + url_count = len(url_ids_cos) url_full_count = self.con.execute("select count(rowid) from url_list").fetchone()[0] print >> sys.stderr, "Number of documents: %d " % url_full_count @@ -192,6 +198,7 @@ def cos_search(self, text, n): print >> sys.stderr, "Searching..." 
heap = [] + url_ids = [] for url_id in url_ids: #print url_id url_words = self.con.execute("select word from word_list join word_location on " @@ -211,6 +218,16 @@ def cos_search(self, text, n): else: heapq.heappushpop(heap, (url_cos, url_id)) + heap = [] + for url_pair in url_ids_cos: + url_cos = url_pair[1] + url_id = url_pair[0] + if len(heap) < self.SHOW_ANSWER: + heapq.heappush(heap, (url_cos, url_id)) + else: + heapq.heappushpop(heap, (url_cos, url_id)) + + heap.sort(reverse=True) top_n = [(pair[1], pair[0]) for pair in heap] @@ -231,4 +248,4 @@ def cos_search(self, text, n): print '"issue": "%s" ' % article_data[4] print "}" number += 1 - print "]}" + print "]}" \ No newline at end of file From 98118dea2f08519a88f3fdcb93836d9aed85697b Mon Sep 17 00:00:00 2001 From: elizabeth Date: Mon, 17 Feb 2014 12:29:43 +0400 Subject: [PATCH 22/22] Quick search with sql query maybe correct --- searcher.py | 72 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 30 deletions(-) diff --git a/searcher.py b/searcher.py index d4384b3..ddf25c6 100644 --- a/searcher.py +++ b/searcher.py @@ -67,7 +67,7 @@ def get_top_words(self, words, n): return clear_list[0: n] - def find_rows(self, words, words_tfidf): + def find_rows(self, words, words_tf_idf_idf): '''Find documents which contain one of words and count cos distance''' if len(words) == 0: return [] @@ -75,28 +75,32 @@ def find_rows(self, words, words_tfidf): word_id_list = [] table_num = 0 #clause_list = '' - query = 'select url_id, sum(tfidf * word_count) / length from (' + query = 'SELECT url_id, denorm_rank / url_list.length AS rank FROM (' \ + ' SELECT url_id, sum/count(*) AS denorm_rank FROM (' + fat_query = 'SELECT url_id as urlid, sum(weight) AS sum, count(*) AS match_words_num FROM (' for word in words: word_row = self.con.execute( - "select rowid from word_list where word = '%s'" % word).fetchone() + "SELECT rowid FROM word_list WHERE word = '%s'" % word).fetchone() if word_row is not None: word_id = word_row[0] - #print "word_id: %d" % word_id word_id_list.append(word_id) if table_num > 0: - query += ' union ' - query += 'select url_id, word_id, %f AS tfidf, count(location) AS word_count , length ' \ - 'FROM word_location WL JOIN url_list UL ON (WL.url_id = UL.rowid and word_id = %d) ' \ - 'GROUP BY url_id, word_id, length' % (words_tfidf[word], word_id) + fat_query += ' UNION ' + fat_query += 'SELECT url_id, count(*) * %f AS weight ' \ + 'FROM word_location WHERE word_id = %d ' \ + 'GROUP BY url_id' % (words_tf_idf_idf[word], word_id) table_num += 1 - query += ') group by url_id ' + fat_query += ') GROUP BY urlid ' + query += fat_query + query += ') as FatQuery JOIN word_location ON (FatQuery.urlid = word_location.url_id)' \ + 'GROUP BY urlid, sum, match_words_num)' \ + 'JOIN url_list ON url_list.rowid = url_id' # print query +# print fat_query result = self.con.execute(query) rows = [row for row in result] - -# print rows return rows @@ -117,22 +121,22 @@ def idf(self, word): else: return idf[0] - def get_top_tf_idf(self, words, n): - '''Get top n words from list words using tf * idf''' + def get_top_tf_idf(self, words, top_words): + '''Return values of tf * idf * idf for words in top_words''' tf_dict = self.tf(words) - heap = [] + idf_dict = {word: 0 for word in top_words} for word in tf_dict: - idf = self.idf(word) - tf_idf = tf_dict[word] * idf - if len(heap) < n: - heapq.heappush(heap, (tf_idf, word)) + if word in top_words: + idf = self.idf(word) + idf_dict[word] = idf + tf_dict[word] = 
tf_dict[word] * idf * idf else: - heapq.heappushpop(heap, (tf_idf, word)) + tf_dict[word] = 0 - heap.sort(reverse=True) - inverted = [(pair[1], pair[0]) for pair in heap] - return inverted + tf_dict = {word: tf_dict[word] for word in tf_dict if tf_dict[word] > 0} + inverted = [(word, tf_dict[word]) for word in tf_dict] + return inverted, idf_dict def get_url_by_id(self, url_id): @@ -144,9 +148,9 @@ def get_url_by_id(self, url_id): return url @staticmethod - def count_length(l): + def count_length(l, idfs): '''Return Euclidean norm of the vector, saved in list of pairs''' - words_dict = {pair[0]: pair[1] for pair in l} + words_dict = {pair[0]: (pair[1] / idfs[pair[0]]) for pair in l} length = 0 for word in words_dict: length = length + words_dict[word] * words_dict[word] @@ -182,11 +186,18 @@ def cos_search(self, text, n): top is counted by (tf * idf) ''' text_words = Searcher.separate_words(text) - text_words_tf_idf = self.get_top_tf_idf(text_words, n) - text_length = Searcher.count_length(text_words_tf_idf) + top_text_words = self.get_top_words(text_words, n) + + answer = self.get_top_tf_idf(text_words, top_text_words) + text_words_tf_idf_idf = answer[0] + top_idfs = answer[1] - top_text_words = self.get_top_words(text_words, self.MAGIC_NUMBER) - url_ids_cos = self.find_rows(top_text_words, {word: idf for (word, idf) in text_words_tf_idf}) + text_length = Searcher.count_length(text_words_tf_idf_idf, top_idfs) + +# print top_text_words +# print text_words_tf_idf_idf + + url_ids_cos = self.find_rows(top_text_words, {word: tf_idf_idf for (word, tf_idf_idf) in text_words_tf_idf_idf}) #url_ids = [url_id[0] for url_id in url_ids] url_count = len(url_ids_cos) @@ -197,6 +208,7 @@ def cos_search(self, text, n): print >> sys.stderr, "Number of documents after cutting: %d " % url_count print >> sys.stderr, "Searching..." + ''' heap = [] url_ids = [] for url_id in url_ids: @@ -217,10 +229,10 @@ def cos_search(self, text, n): heapq.heappush(heap, (url_cos, url_id)) else: heapq.heappushpop(heap, (url_cos, url_id)) - + ''' heap = [] for url_pair in url_ids_cos: - url_cos = url_pair[1] + url_cos = url_pair[1] / text_length url_id = url_pair[0] if len(heap) < self.SHOW_ANSWER: heapq.heappush(heap, (url_cos, url_id))
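# ---------------------------------------------------------------------------
# A minimal, self-contained sketch of the ranking these last patches converge
# on, written as plain Python over hypothetical in-memory data; the real code
# pushes comparable arithmetic into a single SQL query over word_location,
# word_list and url_list, so this is the textbook cosine form for reference,
# not a line-by-line transcription of that query.
import math

def cosine_rank(query_tf, idf, doc_term_counts):
    # query_tf:        {word: term frequency in the query text}
    # idf:             {word: idf weight, as word_list.idf in the real schema}
    # doc_term_counts: {doc_id: {word: term count in that document}}
    # Returns [(doc_id, cosine similarity)] sorted best match first.
    q = {w: query_tf[w] * idf.get(w, 0.0) for w in query_tf}
    q_len = math.sqrt(sum(v * v for v in q.values())) or 1.0
    scores = []
    for doc_id, counts in doc_term_counts.items():
        d = {w: counts[w] * idf.get(w, 0.0) for w in counts}
        d_len = math.sqrt(sum(v * v for v in d.values())) or 1.0
        dot = sum(q[w] * d.get(w, 0.0) for w in q)
        scores.append((doc_id, dot / (q_len * d_len)))
    return sorted(scores, key=lambda pair: pair[1], reverse=True)

# usage with made-up numbers:
#   cosine_rank({"service": 2, "quality": 1},
#               {"service": 1.8, "quality": 2.4},
#               {1: {"service": 4, "quality": 1}, 2: {"quality": 3}})
# ---------------------------------------------------------------------------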