Commit 498ce8d: add annotation

1 parent 7ea14d9

4 files changed: +23 -18 lines

README.md (+12-8)
```diff
@@ -1,11 +1,15 @@
 # Get 50 news items from the test data and, for each, find the 50 most similar news items in the train data.
 
-- enviroment :\
-win 10\
-anaconda 4.3.1\
-python 3\
-jieba 0.39\
-gensim 3.4.0
+## environment
+- win 10
+- anaconda 4.3.1
+- python 3
+- jieba 0.39
+- gensim 3.4.0
 
-- demo :\
-cmd python get_top20.py
+## TODO
+
+- calculate a sentence-meaning vector for each news item instead of computing the similarity for every pair of them, to cut down the amount of computation (sketched below).
+- cluster the news, then run TF-IDF on each news item to find its key words.
+## demo
+- cmd: python get_top20.py
```
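
The first TODO item describes averaging word vectors into one sentence vector per title so that similarity becomes a single matrix product instead of a pairwise comparison. A minimal sketch of that idea, assuming a loaded gensim 3.x Word2Vec model; the function name `sentence_vector` and the matrix variables are illustrative, not from this repo:

```python
import numpy as np

def sentence_vector(words, model):
    # average the vectors of the words that exist in the model's vocabulary
    vecs = [model.wv[w] for w in words if w in model.wv.vocab]
    if not vecs:
        return np.zeros(model.vector_size)  # no known words: zero vector
    return np.mean(vecs, axis=0)

# hypothetical usage: one vector per tokenized title, then bulk cosine similarity
# test_mat  = np.array([sentence_vector(t, model) for t in test_words])
# train_mat = np.array([sentence_vector(t, model) for t in train_words])
# test_mat  /= np.linalg.norm(test_mat, axis=1, keepdims=True)
# train_mat /= np.linalg.norm(train_mat, axis=1, keepdims=True)
# sims = test_mat @ train_mat.T  # one row of similarities per test title
```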

lib/calc_test_data.py (+5-5)
```diff
@@ -7,8 +7,8 @@
 def calc_test_data():
     model_path = cfg.model_output_path
     print('loading model from {}'.format(model_path))
-    model = Word2Vec.load(model_path)
-    vocab = list(model.wv.vocab.keys())
+    model = Word2Vec.load(model_path)  # load the model
+    vocab = list(model.wv.vocab.keys())  # get the corpus vocabulary
     test_data_path = cfg.test_df_path
     print('loading test data from {}'.format(test_data_path))
     test = pd.read_csv(test_data_path,encoding='gbk')
@@ -22,7 +22,7 @@ def calc_test_data():
     for i in range(test.shape[0]):
         raw = test['title'][i]
         l1 = hw.jieba_fenci(raw,stopwords_list)
-        l1 = hw.clear_list(l1,vocab)
+        l1 = hw.clear_list(l1,vocab)  # clean the list: keep only words that are in the vocabulary
         test_words.append(l1)
 
     anss = np.zeros((50,485686))
@@ -49,7 +49,7 @@ def calc_test_data():
         # test['top20'][i] = str(top)
     for i in range(50):
         ans = anss[i,:]
-        top = heapq.nlargest(20,range(len(ans)),ans.__getitem__)
+        top = heapq.nlargest(20,range(len(ans)),ans.__getitem__)  # indices of the top-20 most similar titles
         an['top20'][i] = top
 
-    test.to_csv(cfg.top20_path,mode='a')
+    test.to_csv(cfg.top20_path,mode='a')  # save the results
```
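
The annotated top-20 line ranks indices by score with `heapq.nlargest`; a small self-contained example of the same pattern:

```python
import heapq
import numpy as np

sims = np.array([0.1, 0.9, 0.4, 0.7])  # toy similarity scores

# rank the indices 0..len(sims)-1 by their score and keep the best 2
top = heapq.nlargest(2, range(len(sims)), sims.__getitem__)
print(top)  # [1, 3]
```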

lib/train_model.py (+2-2)
```diff
@@ -7,12 +7,12 @@ def train_model():
     train_df_path = cfg.train_df_path
     model_output_path = cfg.model_output_path
     print('preparing words')
-    get_words(words_path,train_df_path)
+    get_words(words_path,train_df_path)  # prepare the corpus
     print('train model using {}'.format(train_df_path))
     model_output_path = cfg.model_output_path
     model = Word2Vec(LineSentence(words_path),
                      size=cfg.train_size, window=cfg.train_window,
-                     min_count=cfg.train_min_count, workers=cfg.train_workers)
+                     min_count=cfg.train_min_count, workers=cfg.train_workers)  # train the model
 
     model.save(model_output_path)
```
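
For reference, the same gensim 3.x training call in standalone form; the file names and hyperparameter values below are placeholders, not the repo's cfg settings:

```python
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# words.txt holds one tokenized sentence per line, words separated by whitespace
# (gensim 3.x API: the `size` argument became `vector_size` in gensim 4)
model = Word2Vec(LineSentence('words.txt'),
                 size=100, window=5, min_count=5, workers=4)
model.save('word2vec.model')

# reload and inspect, the same way calc_test_data.py does
model = Word2Vec.load('word2vec.model')
vocab = list(model.wv.vocab.keys())
```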

utils/get_words.py (+4-3)
```diff
@@ -2,15 +2,16 @@
 import pandas as pd
 
 
-stopwords_list = hw.stop_words()
+stopwords_list = hw.stop_words()  # load the stop words
 def get_words(words_path,df_path):
+    # build the corpus from the sentences in the df_path file
     df = pd.read_csv(df_path)
     with open(words_path,'wb') as y:
         for i in range(df.shape[0]):
-            w_l = hw.jieba_fenci(df['title'][i],stopwords_list)
+            w_l = hw.jieba_fenci(df['title'][i],stopwords_list)  # segment the title with jieba
             #print(w_l)
             for n in range(len(w_l)):
-                y.write(w_l[n].encode('utf-8'))
+                y.write(w_l[n].encode('utf-8'))  # write each word (stop words already removed)
             y.write('\n'.encode('utf-8'))
             if (i+1)%10000==0:
                 #break
```
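
`hw.jieba_fenci` itself is not part of this diff; a plausible sketch of what such a jieba-based tokenizer looks like, assuming it segments a sentence and drops stop words (the body below is a guess, only `jieba.cut` is the real API):

```python
import jieba

def jieba_fenci(text, stopwords_list):
    # hypothetical reimplementation: segment with jieba, then drop stop words
    # and whitespace-only tokens; the repo's hw.jieba_fenci may differ
    return [w for w in jieba.cut(text)
            if w.strip() and w not in stopwords_list]

# e.g. jieba_fenci('今天天气不错', set())  ->  ['今天', '天气', '不错']
```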
