Skip to content

Commit

Permalink
idf counting replaced to crawl_acm
Browse files Browse the repository at this point in the history
  • Loading branch information
Elizaveta239 committed Dec 18, 2013
1 parent 0b22cfe commit 35a50d2
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 17 deletions.
3 changes: 2 additions & 1 deletion crawl_acm
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import search
c = search.Crawler('db_acm')
c.create_index_tables()
c.crawl("http://dl.acm.org/pub.cfm?id=J204&CFID=261281623&CFTOKEN=70296603")
c.crawl("http://dl.acm.org/pub.cfm?id=J204&CFID=261281623&CFTOKEN=70296603")
c.count_idf()


17 changes: 17 additions & 0 deletions search.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import urllib2
from BeautifulSoup import *
from pysqlite2 import dbapi2 as sqlite
import math



Expand Down Expand Up @@ -215,6 +216,22 @@ def crawl(self, journal_url, depth=2):
print "%4d papers were indexed" % count


def count_idf(self):
print "Counting idf..."
url_count = self.con.execute("select count(rowid) from url_list").fetchone()[0]
words_count = self.con.execute("select count(rowid) from word_list").fetchone()[0]
max = 0
for word_id in range(1, words_count + 1):

This comment has been minimized.

Copy link
@dbarashev

dbarashev Dec 21, 2013

Contributor

I think that this query could save you some Python code and, most likely, could be way more efficient:

-- for each word_id returns word_id and the total number of distinct url_ids associated with word_id
SELECT word_id, COUNT(DISTINCT url_id)
FROM word_location
GROUP BY word_id;

Also, it will work regardless of the word id numbering scheme (this loop makes a fragile assumption that they are sequentially numbered from 1)

urls = self.con.execute("select distinct url_id from word_location "
"where word_id = %s" % word_id).fetchall()
num = len(urls)
if num > max:
max = num
idf = math.log10(url_count / num)
self.con.execute("update word_list set idf = %f where rowid = %d" % (idf, word_id))
self.db_commit()


def create_index_tables(self):
""" Create database tables """
res = self.con.execute('select name from sqlite_master where type="table" and name="url_list"').fetchone()
Expand Down
15 changes: 0 additions & 15 deletions searcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,21 +46,6 @@ def get_top_words(self, words, n):
else:
return sorted_top[0: n]

def count_idf(self):
    """Compute the inverse document frequency of each word and store it.

    For every word_id present in ``word_location`` the value
    ``log10(total_urls / urls_containing_word)`` is written to the ``idf``
    column of the matching ``word_list`` row, then the changes are committed
    through ``self.db_commit()``.

    Words that have no rows in ``word_location`` are left untouched rather
    than causing a division by zero.
    """
    print("Counting idf...")
    url_count = self.con.execute("select count(rowid) from url_list").fetchone()[0]

    # One grouped query yields the document frequency of every word, so no
    # assumption is made that word rowids are numbered sequentially from 1.
    doc_freqs = self.con.execute(
        "select word_id, count(distinct url_id) from word_location "
        "group by word_id").fetchall()

    # float() forces true division: under Python 2 an int / int quotient is
    # truncated, which would distort the logarithm's argument.
    updates = [(math.log10(float(url_count) / doc_freq), word_id)
               for word_id, doc_freq in doc_freqs]

    # Bound parameters keep full float precision (the former "%f" formatting
    # rounded to six decimals) and avoid string-built SQL.
    self.con.executemany("update word_list set idf = ? where rowid = ?", updates)
    self.db_commit()


def tf(self, words):
'''Return top n words in text'''
Expand Down
1 change: 0 additions & 1 deletion start_search
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import searcher
s = searcher.Searcher('db_acm')
s.count_idf()
s.cos_search("Quality of service (QoS) can be a critical element for achieving the business goals of a service provider, for the acceptance of a service by the user, or for guaranteeing service characteristics in a composition of services, where a service is defined as either a software or a software-support (i.e., infrastructural) service which is available on any type of network or electronic channel. The goal of this article is to compare the approaches to QoS description in the literature, where several models and metamodels are included. consider a large spectrum of models and metamodels to describe service quality, ranging from ontological approaches to define quality measures, metrics, and dimensions, to metamodels enabling the specification of quality-based service requirements and capabilities as well as of SLAs (Service-Level Agreements) and SLA templates for service provisioning.", 10)


0 comments on commit 35a50d2

Please sign in to comment.