Skip to content

Commit

Permalink
sql query correction
Browse files Browse the repository at this point in the history
  • Loading branch information
Elizaveta239 committed Dec 28, 2013
1 parent 364f485 commit f61daa5
Showing 1 changed file with 16 additions and 21 deletions.
37 changes: 16 additions & 21 deletions search.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,8 +194,8 @@ def crawl(self, journal_url, depth=2):
count = 1
for link in links:
#DEBUG
if count > 5:
break
#if count > 5:
# break
ref = self.delete_user_info(link['href'])
print "=============="
print " Journal link: " + self.BASE + ref
Expand All @@ -205,8 +205,8 @@ def crawl(self, journal_url, depth=2):

for paper in list_of_papers:
#DEBUG
if count > 5:
break
#if count > 5:
# break
paper_ref = self.delete_user_info(paper['href'])
if (len(dict(paper.attrs)) == 1) and (paper_ref.startswith(self.IS_PAPER_LINK)):
paper_abstract = self.open_tab(self.BASE + paper_ref, self.ABSTRACT_TAB_NAME)
Expand Down Expand Up @@ -242,23 +242,18 @@ def count_vectors_length(self):
'''Count vector's length for each url (Euclidean norm of tf * idf for each word in url)
Set this value to the table url_list'''
print "Counting lengths..."
url_ids = self.con.execute("select rowid from url_list").fetchall()
url_ids = (url_id[0] for url_id in url_ids)

for url_id in url_ids:
words_count = self.con.execute("select word_id, count(word_id), idf from word_location join "
" word_list on word_location.word_id = word_list.rowid where "
" url_id = %d group by word_id" % url_id).fetchall()
words_dict = {record[0]: record[1] for record in words_count}
words_idf = {record[0]: record[2] for record in words_count}
sum_of_words = sum(words_dict[word] for word in words_dict)

length = 0
for word in words_dict:
length += pow(words_dict[word] * words_idf[word], 2)
length = math.sqrt(length)
length = length / sum_of_words
self.con.execute("update url_list set length = %f where rowid = %d" % (length, url_id))

url_count = self.con.execute("select url_id, sum(wcount), sum(count_idf) from "
"(select url_id, count(location) as wcount, "
"count(location) * count(location) * word_list.idf * word_list.idf as count_idf "

This comment has been minimized.

Copy link
@dbarashev

dbarashev Dec 30, 2013

Contributor

This is the sum of squares of (term count * idf), right? Shouldn't it be the sum of squares of tf * idf, where tf = count(location) / total number of words in url_id?

"from word_location join word_list on word_location.word_id=word_list.rowid "
"group by url_id, word_id) T1 "
"group by T1.url_id").fetchall()

for url_record in url_count:
length = math.sqrt(url_record[2])
length = length / url_record[1]
self.con.execute("update url_list set length = %f where rowid = %d" % (length, url_record[0]))

This comment has been minimized.

Copy link
@dbarashev

dbarashev Dec 30, 2013

Contributor

By the way, you can try updating url_list with the same query that you use for the calculations. See http://stackoverflow.com/questions/2334712/update-from-select-using-sql-server
I'm not sure whether SQLite supports it, though.


self.db_commit()

Expand Down

0 comments on commit f61daa5

Please sign in to comment.