from __future__ import print_function, unicode_literals
from gensim.models import KeyedVectors
from stanfordcorenlp import StanfordCoreNLP
from pyltp import NamedEntityRecognizer, Postagger, Segmentor
from pprint import pprint as pp
import pickle as pkl

base_path = '/root/files/'
file_path = base_path + 'news_content.csv'
nlp_path = '/root/stanford-corenlp'
word2vec_model_path = base_path + 'sgns.merge.word.bz2'

# load the LTP segmentation, POS tagging and NER models
segmentor = Segmentor()
segmentor.load('/root/files/ltp_data_v3.4.0/cws.model')
postagger = Postagger()
postagger.load('/root/files/ltp_data_v3.4.0/pos.model')
recognizer = NamedEntityRecognizer()
recognizer.load('/root/files/ltp_data_v3.4.0/ner.model')

# keywords that can represent 说 (say), precomputed by search() and pickled
with open('./saved_files/r.pkl', 'rb') as f:
    r = pkl.load(f)

end = {'。', '!', '?'}
say_represents = r
# remove common function words that slipped into the keyword set
update = {'的', '和', '但', '是', '也'}
say_represents = say_represents - update
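# Load the pretrained word vectors that search() queries through `wm`.
# The exact load call is an assumption: sgns.merge.word.bz2 is assumed to be a
# text-format word2vec file, which gensim can read directly from the archive.
wm = KeyedVectors.load_word2vec_format(word2vec_model_path, binary=False)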
# convert LTP output to the bosonnlp format: {'word': words, 'entity': [[start, end, entity_type]]}
def ltp_ner(news):
    mapper = {'Ni': 'org_name', 'Nh': 'person_name'}
    words = segmentor.segment(news)
    postags = postagger.postag(words)
    nertags = recognizer.recognize(words, postags)
    entity = []
    k = 0
    while k < len(words):
        if nertags[k] == 'O' or nertags[k].endswith('Ns'):
            # not an entity, or a place name: skip
            k += 1
        elif nertags[k].startswith('S'):
            # single-word entity
            entity.append([k, k + 1, mapper[nertags[k][2:]]])
            k += 1
        elif nertags[k].startswith('B'):
            # multi-word entity: scan until the closing 'E-*' tag
            start = k
            while k < len(words) and not nertags[k].startswith('E'):
                k += 1
            # use the tag at `start` so a missing 'E-*' tag cannot index past the end
            entity.append([start, k + 1, mapper[nertags[start][2:]]])
            k += 1
        else:
            k += 1
    return {
        'word': list(words),
        'entity': entity
    }
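# Hypothetical example of the returned structure (sentence, indices and tags are
# illustrative only, not actual model output):
#   ltp_ner('新华社报道……')
#   -> {'word': ['新华社', '报道', ...], 'entity': [[0, 1, 'org_name']]}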
# search for related keywords that can represent 说 (say) via word-vector similarity.
# The result has already been saved to saved_files/r.pkl.
def search(query, depth=3):
    r = {}
    all_r = []
    topn = 10
    s = wm.most_similar(query, topn=topn)
    r[0] = {query: s}
    # expand the neighbourhood layer by layer, up to `depth` layers
    for d in range(1, depth):
        _v = r[d - 1]
        s = {}
        for k, v in _v.items():
            for i in v:
                s[i[0]] = wm.most_similar(i[0], topn=topn)
        r[d] = s
    # flatten all layers, sort by similarity and deduplicate
    for _, v in r.items():
        for _, k in v.items():
            all_r += k
    all_r = sorted(all_r, key=lambda x: x[1], reverse=True)
    all_r = set([i[0] for i in all_r])
    pp(all_r)
    return all_r
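# Usage sketch: saved_files/r.pkl was presumably produced with a call like
#   say_keywords = search('说', depth=3)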
def contains_say_keywords(c):
    return any(i in c for i in say_represents)


def all_say_keywords(c):
    return [i for i in c if i in say_represents]
# find the entity nearest to the say-keyword and use it instead.
# TODO: bad implementation, needs refactoring.
def correct_p(p):
    # currently only the first keyword is used
    k_index = p[1].find(p[2][0])
    e_index = p[1].find(p[0])
    ner = uniteNER(p[1])
    e = ner[1]
    ret = p[0]
    closest = k_index - e_index - 1
    for ent_start, ent_end in e.items():
        # skip entities that end after the keyword position
        if k_index < ent_end:
            continue
        if (k_index - ent_end) < closest:
            # this entity is closer to the keyword: take it instead
            ret = ner[2][ent_start:ent_end]
            closest = k_index - ent_end
    return [''.join(ret), p[1], p[2]]
# unified NER interface; currently uses LTP as the backend.
def uniteNER(news):
    ner = ltp_ner(news)
    words = ner['word']
    entity = ner['entity']
    N = []
    # record the entity spans, k:v = start index : end index
    entity_start = {}
    for e in entity:
        if e[2] in {'org_name', 'person_name'}:
            entity_start[e[0]] = e[1]
            N.append([''.join(words[e[0]:e[1]]), e[2]])
    return N, entity_start, words
# alternative NER backend using the BosonNLP API.
def bosonnlpNER(news):
    from bosonnlp import BosonNLP
    nlp = BosonNLP('cKWUytiR.34676.f5F2YbS_EyX2')
    ner = nlp.ner(news)[0]
    print(ner)
    words = ner['word']
    entity = ner['entity']
    N = []
    # record the entity spans, k:v = start index : end index
    entity_start = {}
    for e in entity:
        if e[2] in {'org_name', 'person_name'}:
            entity_start[e[0]] = e[1]
            N.append([''.join(words[e[0]:e[1]]), e[2]])
    return N, entity_start, words
def extract_points(news, correct=False):
    points = []
    ners, es, words = uniteNER(news)
    k = 0
    while k < len(words):
        if k in es:
            # an entity starts here: first record the entity itself
            _p = [''.join(words[k:es[k]])]
            # then collect the sentence, scanning until an end-of-sentence mark
            start = k
            while k < len(words) and words[k] not in end:
                k += 1
            _p.append(words[start:k])
            points.append(_p)
        k += 1
    # filter out the sentences that are not real points.
    # the current method is naive: keep a sentence only if it contains a keyword that can represent 说
    points = list(filter(lambda x: contains_say_keywords(x[1]), points))
    keywords = [all_say_keywords(p[1]) for p in points]
    points = [[i[0], ''.join(i[1]), k] for i, k in zip(points, keywords)]
    if correct:
        points = [correct_p(i) for i in points]
    return points
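# Each returned point has the form [entity, sentence, keywords], where keywords
# are the say-keywords found in the sentence; with correct=True the entity is
# replaced by the one nearest to the first keyword (see correct_p above).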
def process_news(news):
    return news.replace('\n', '')


def fetch(news):
    n = process_news(news)
    return extract_points(n), extract_points(n, correct=True)
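# Minimal usage sketch. The column name 'content' in news_content.csv is an
# assumption; adjust it to the actual schema of the file.
if __name__ == '__main__':
    import pandas as pd
    news_df = pd.read_csv(file_path)
    for content in news_df['content'].head(5):
        raw_points, corrected_points = fetch(content)
        pp(corrected_points)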