Commit 498ce8d: add annotation

1 parent 7ea14d9

4 files changed: +23 -18 lines

README.md (+12-8)
```diff
@@ -1,11 +1,15 @@
 # Get 50 news items from the test data and, for each, find the 50 most similar news items in the train data.
 
-- enviroment :\
-win 10\
-anaconda 4.3.1\
-python 3\
-jieba 0.39\
-gensim 3.4.0
+## environment
+- win 10
+- anaconda 4.3.1
+- python 3
+- jieba 0.39
+- gensim 3.4.0
 
-- demo :\
-cmd python get_top20.py
+## TODO
+
+- calculate a sentence-meaning vector for each news item instead of computing the similarity for every pair of them, to cut down the amount of computation (sketched below).
+- cluster the news, then run TF-IDF on each news item to find its key words.
+## demo
+- cmd: python get_top20.py
```
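
The first TODO item describes averaging word vectors into one sentence vector per title so that similarity becomes a single matrix product instead of a pairwise comparison. A minimal sketch of that idea, assuming a loaded gensim 3.x Word2Vec model; the function name `sentence_vector` and the matrix variables are illustrative, not from this repo:

```python
import numpy as np

def sentence_vector(words, model):
    # average the vectors of the words that exist in the model's vocabulary
    vecs = [model.wv[w] for w in words if w in model.wv.vocab]
    if not vecs:
        return np.zeros(model.vector_size)  # no known words: zero vector
    return np.mean(vecs, axis=0)

# hypothetical usage: one vector per tokenized title, then bulk cosine similarity
# test_mat  = np.array([sentence_vector(t, model) for t in test_words])
# train_mat = np.array([sentence_vector(t, model) for t in train_words])
# test_mat  /= np.linalg.norm(test_mat, axis=1, keepdims=True)
# train_mat /= np.linalg.norm(train_mat, axis=1, keepdims=True)
# sims = test_mat @ train_mat.T  # one row of similarities per test title
```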

lib/calc_test_data.py (+5-5)
```diff
@@ -7,8 +7,8 @@
 def calc_test_data():
     model_path = cfg.model_output_path
     print('loading model from {}'.format(model_path))
-    model = Word2Vec.load(model_path)
-    vocab = list(model.wv.vocab.keys())
+    model = Word2Vec.load(model_path)  # load the model
+    vocab = list(model.wv.vocab.keys())  # get the corpus vocabulary
     test_data_path = cfg.test_df_path
     print('loading test data from {}'.format(test_data_path))
     test = pd.read_csv(test_data_path,encoding='gbk')
@@ -22,7 +22,7 @@ def calc_test_data():
     for i in range(test.shape[0]):
         raw = test['title'][i]
         l1 = hw.jieba_fenci(raw,stopwords_list)
-        l1 = hw.clear_list(l1,vocab)
+        l1 = hw.clear_list(l1,vocab)  # clean the list: keep only words that are in the vocabulary
         test_words.append(l1)
 
     anss = np.zeros((50,485686))
@@ -49,7 +49,7 @@ def calc_test_data():
         # test['top20'][i] = str(top)
     for i in range(50):
         ans = anss[i,:]
-        top = heapq.nlargest(20,range(len(ans)),ans.__getitem__)
+        top = heapq.nlargest(20,range(len(ans)),ans.__getitem__)  # indices of the top-20 most similar titles
         an['top20'][i] = top
 
-    test.to_csv(cfg.top20_path,mode='a')
+    test.to_csv(cfg.top20_path,mode='a')  # save the results
```
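
The annotated top-20 line ranks indices by score with `heapq.nlargest`; a small self-contained example of the same pattern:

```python
import heapq
import numpy as np

sims = np.array([0.1, 0.9, 0.4, 0.7])  # toy similarity scores

# rank the indices 0..len(sims)-1 by their score and keep the best 2
top = heapq.nlargest(2, range(len(sims)), sims.__getitem__)
print(top)  # [1, 3]
```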

lib/train_model.py (+2-2)
```diff
@@ -7,12 +7,12 @@ def train_model():
     train_df_path = cfg.train_df_path
     model_output_path = cfg.model_output_path
     print('preparing words')
-    get_words(words_path,train_df_path)
+    get_words(words_path,train_df_path)  # prepare the corpus
     print('train model using {}'.format(train_df_path))
     model_output_path = cfg.model_output_path
     model = Word2Vec(LineSentence(words_path),
                      size=cfg.train_size, window=cfg.train_window,
-                     min_count=cfg.train_min_count, workers=cfg.train_workers)
+                     min_count=cfg.train_min_count, workers=cfg.train_workers)  # train the model
 
     model.save(model_output_path)
```
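
For reference, the same gensim 3.x training call in standalone form; the file names and hyperparameter values below are placeholders, not the repo's cfg settings:

```python
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# words.txt holds one tokenized sentence per line, words separated by whitespace
# (gensim 3.x API: the `size` argument became `vector_size` in gensim 4)
model = Word2Vec(LineSentence('words.txt'),
                 size=100, window=5, min_count=5, workers=4)
model.save('word2vec.model')

# reload and inspect, the same way calc_test_data.py does
model = Word2Vec.load('word2vec.model')
vocab = list(model.wv.vocab.keys())
```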

utils/get_words.py (+4-3)
```diff
@@ -2,15 +2,16 @@
 import pandas as pd
 
 
-stopwords_list = hw.stop_words()
+stopwords_list = hw.stop_words()  # load the stop words
 def get_words(words_path,df_path):
+    # build the corpus from the sentences in the df_path file
     df = pd.read_csv(df_path)
     with open(words_path,'wb') as y:
         for i in range(df.shape[0]):
-            w_l = hw.jieba_fenci(df['title'][i],stopwords_list)
+            w_l = hw.jieba_fenci(df['title'][i],stopwords_list)  # segment the title with jieba
             #print(w_l)
             for n in range(len(w_l)):
-                y.write(w_l[n].encode('utf-8'))
+                y.write(w_l[n].encode('utf-8'))  # write each word (stop words already removed)
             y.write('\n'.encode('utf-8'))
             if (i+1)%10000==0:
                 #break
```
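
`hw.jieba_fenci` itself is not part of this diff; a plausible sketch of what such a jieba-based tokenizer looks like, assuming it segments a sentence and drops stop words (the body below is a guess, only `jieba.cut` is the real API):

```python
import jieba

def jieba_fenci(text, stopwords_list):
    # hypothetical reimplementation: segment with jieba, then drop stop words
    # and whitespace-only tokens; the repo's hw.jieba_fenci may differ
    return [w for w in jieba.cut(text)
            if w.strip() and w not in stopwords_list]

# e.g. jieba_fenci('今天天气不错', set())  ->  ['今天', '天气', '不错']
```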
