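"""Keyword-rule matcher (Python 2).

Periodically scans sentences stored in MySQL for ordered, gap-constrained
occurrences of each rule's words, records matches in
rule_sentence_relations, and keeps the last analyzed post id per rule in
Redis so each pass only looks at new posts.
"""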
import MySQLdb as mdb
import redis
import sys

# Python 2 workaround so utf-8 text can be formatted and logged without
# explicit encoding.
reload(sys)
sys.setdefaultencoding('utf-8')

from database_info import mysql_info, redis_info
from time import strftime, time, sleep

flogname = './logs/analysis.log'
queries = {
    # Highest post id overall.
    'get_max_post': 'SELECT MAX(_id) FROM posts',

    # Highest post id for one topic, restricted to the given sources
    # (the IN-list placeholder is widened with formatstring() first).
    'get_max_post_by_cond':
        'SELECT MAX(_id) FROM posts WHERE topic_id = %s AND source_id IN (%s)',

    # One row per rule: its category, the rule's word-position span (gap),
    # used as the largest allowed distance between consecutive matches, and
    # the rule's word ids joined in order.
    'get_rule_words': """
        SELECT category_seq, rule_id,
               (MAX(word_seq) - MIN(word_seq)) AS gap,
               GROUP_CONCAT(word_id ORDER BY word_seq SEPARATOR ",") AS words
        FROM (SELECT r.category_seq, rw.rule_id, rw.word_seq, rw.word_id
              FROM rule_word_relations AS rw
              JOIN rules AS r ON r._id = rw.rule_id
              ORDER BY word_seq) AS t
        GROUP BY rule_id""",

    # Sentences of posts in (last_post_id, post_until] that contain at least
    # len(rule_word_ids) occurrences of the rule's words; returns the word
    # positions and ids comma-joined in sentence order. The upper bound is
    # inclusive so the newest post is not skipped when the next pass resumes
    # strictly above it.
    'get_sentence_words': """
        SELECT t1.post_id, t1.sentence_seq,
               GROUP_CONCAT(word_seq ORDER BY word_seq SEPARATOR ",") AS word_seqs,
               GROUP_CONCAT(word_id ORDER BY word_seq SEPARATOR ",") AS word_ids
        FROM (SELECT s.post_id, s.sentence_seq, r.word_seq, r.word_id
              FROM sentences AS s
              JOIN sentence_word_relations AS r
                ON s.post_id = r.post_id AND s.sentence_seq = r.sentence_seq
              JOIN (SELECT * FROM posts WHERE _id > %s AND _id <= %s) AS p
                ON p._id = s.post_id
              WHERE r.word_id IN (%s) ORDER BY word_seq) AS t1
        JOIN (SELECT s.post_id, s.sentence_seq, COUNT(*) AS cat_count
              FROM sentences AS s
              JOIN sentence_word_relations AS r
                ON s.post_id = r.post_id AND s.sentence_seq = r.sentence_seq
              WHERE r.word_id IN (%s)
              GROUP BY s.post_id, s.sentence_seq HAVING cat_count >= %s) AS t2
          ON t1.post_id = t2.post_id AND t1.sentence_seq = t2.sentence_seq
        GROUP BY post_id, sentence_seq""",

    # Record a rule/sentence match; INSERT IGNORE makes reruns idempotent.
    'insert_result': """
        INSERT IGNORE INTO rule_sentence_relations (rule_id, post_id, sentence_seq)
        VALUES (%s, %s, %s)""",
}
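# The IN-list '%s' slots above are widened to the right number of
# placeholders before execution, e.g. for three word ids
#   formatstring([1, 2, 3])  # -> '%s, %s, %s'
# turns 'source_id IN (%s)' into 'source_id IN (%s, %s, %s)'; the ids
# themselves are then passed as query parameters.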
def connect_db():
    # Fresh MySQL connection; credentials come from database_info.mysql_info().
    return mdb.connect(**mysql_info())

def connect_rd():
    # Redis client that stores the last analyzed post id per rule.
    return redis.StrictRedis(**redis_info())

def get_time():
    return strftime('%Y-%m-%d %H:%M:%S')

def logging(text):
    # Append a timestamped line to the analysis log.
    with open(flogname, 'a') as log:
        log.write('[%s] %s\n' % (get_time(), text))
def is_valid_sentences(gap, rule_word_ids, word_seqs, sentence_word_ids):
    """Return True if the sentence contains every rule word, in rule order,
    with at most `gap` positions between consecutive matched words.

    word_seqs[j] is the position of sentence_word_ids[j] in the sentence;
    both lists are sorted by position.
    """
    previous_word_id = rule_word_ids[0]
    previous_position = -1
    i, j = 0, 0  # i walks the rule words, j walks the sentence words
    while i < len(rule_word_ids) and j < len(sentence_word_ids):
        rule_word_id = rule_word_ids[i]
        sentence_word_id = sentence_word_ids[j]
        if sentence_word_id == rule_word_id:
            # Accept the first rule word anywhere; each later word must fall
            # within `gap` positions of the previously matched word.
            if i == 0 or word_seqs[j] - previous_position <= gap:
                previous_word_id = rule_word_id
                previous_position = word_seqs[j]
                i += 1
            j += 1
            continue
        if sentence_word_id == previous_word_id:
            # A repeat of the previous rule word restarts the gap window.
            previous_position = word_seqs[j]
        j += 1
    return i == len(rule_word_ids)  # True only if every rule word matched
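# Example with hypothetical word ids: rule words [1, 2] with gap 3 match a
# sentence where word 2 sits two positions after word 1, but not with gap 1:
#   is_valid_sentences(3, [1, 2], [0, 2], [1, 2])  # -> True
#   is_valid_sentences(1, [1, 2], [0, 5], [1, 2])  # -> False (distance 5 > 1)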
def formatstring(arr):
    # Build a '%s, %s, ...' placeholder string for an IN (...) clause.
    if isinstance(arr, list):
        return ', '.join(['%s'] * len(arr))
    return '%s'
def get_post_until(topic=None, sources_ids=None):
    """Highest post id to analyze; if topic is given, sources_ids must be
    given as well."""
    format_string = formatstring(sources_ids) if sources_ids else '%s'
    # MySQLdb's connection context manager yields a cursor.
    with connect_db() as cur:
        if topic:
            cur.execute(queries['get_max_post_by_cond'] % ('%s', format_string),
                        [topic] + sources_ids)
        else:
            cur.execute(queries['get_max_post'])
        post_until = int(cur.fetchone()[0])
    return post_until
def get_rule_words():
    # One row per rule: (category_seq, rule_id, gap, 'word_id,word_id,...').
    with connect_db() as cur:
        cur.execute(queries['get_rule_words'])
        return cur.fetchall()
def analyze(topic=None, sources_ids=None):
    start = time()
    post_until = get_post_until(topic, sources_ids)
    for row_rule in get_rule_words():
        rd = connect_rd()
        db = connect_db()
        cur = db.cursor()
        category_seq = int(row_rule[0])
        rule_id = int(row_rule[1])
        gap = int(row_rule[2])
        rule_word_ids = map(int, row_rule[3].split(','))
        rule_word_id_format_string = formatstring(rule_word_ids)
        # Resume from the last post id already analyzed for this rule.
        last_post_id = int(rd.get(rule_id)) if rd.exists(rule_id) else 0
        cur.execute(queries['get_sentence_words']
                    % ('%s', '%s', rule_word_id_format_string,
                       rule_word_id_format_string, '%s'),
                    [last_post_id, post_until] + rule_word_ids
                    + rule_word_ids + [len(rule_word_ids)])
        for row_sentence in cur.fetchall():
            post_id = int(row_sentence[0])
            sentence_seq = int(row_sentence[1])
            word_seqs = map(int, row_sentence[2].split(','))
            sentence_word_ids = map(int, row_sentence[3].split(','))
            # Keep the sentence only if the rule words occur in order within
            # the allowed gap.
            if is_valid_sentences(gap, rule_word_ids, word_seqs, sentence_word_ids):
                cur.execute(queries['insert_result'], (rule_id, post_id, sentence_seq))
        db.commit()
        db.close()
        # Advance this rule's checkpoint to the end of the analyzed range.
        post_until = max(post_until, last_post_id)
        rd.set(rule_id, post_until)
        logging('rule(%s) post(%s, %s) takes %s'
                % (rule_id, last_post_id, post_until, time() - start))
    logging('run until post_id %s, takes %s seconds' % (post_until, time() - start))
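# Example with hypothetical ids: analyze only topic 3, sources 1 and 4:
#   analyze(topic=3, sources_ids=[1, 4])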
if __name__ == '__main__':
    # Daemon-style loop: analyze whatever is new, then sleep for 20 minutes.
    while True:
        logging('analyze start!!!')
        analyze()
        logging('sleep')
        sleep(60 * 20)