Skip to content

Commit

Permalink
add 201707-201708 blog and code
Browse files Browse the repository at this point in the history
  • Loading branch information
pinard.liu committed Sep 16, 2018
1 parent 3afff3c commit b855411
Show file tree
Hide file tree
Showing 2 changed files with 266 additions and 0 deletions.
261 changes: 261 additions & 0 deletions natural-language-processing/word2vec.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,261 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (C) 2016 - 2019 Pinard Liu([email protected])\n",
"\n",
"https://www.cnblogs.com/pinard\n",
"\n",
"Permission given to modify the code as long as you keep this declaration at the top\n",
"\n",
"用gensim学习word2vec https://www.cnblogs.com/pinard/p/7278324.html"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# -*- coding: utf-8 -*-\n",
"\n",
"import jieba\n",
"import jieba.analyse\n",
"\n",
"jieba.suggest_freq('沙瑞金', True)\n",
"jieba.suggest_freq('田国富', True)\n",
"jieba.suggest_freq('高育良', True)\n",
"jieba.suggest_freq('侯亮平', True)\n",
"jieba.suggest_freq('钟小艾', True)\n",
"jieba.suggest_freq('陈岩石', True)\n",
"jieba.suggest_freq('欧阳菁', True)\n",
"jieba.suggest_freq('易学习', True)\n",
"jieba.suggest_freq('王大路', True)\n",
"jieba.suggest_freq('蔡成功', True)\n",
"jieba.suggest_freq('孙连城', True)\n",
"jieba.suggest_freq('季昌明', True)\n",
"jieba.suggest_freq('丁义珍', True)\n",
"jieba.suggest_freq('郑西坡', True)\n",
"jieba.suggest_freq('赵东来', True)\n",
"jieba.suggest_freq('高小琴', True)\n",
"jieba.suggest_freq('赵瑞龙', True)\n",
"jieba.suggest_freq('林华华', True)\n",
"jieba.suggest_freq('陆亦可', True)\n",
"jieba.suggest_freq('刘新建', True)\n",
"jieba.suggest_freq('刘庆祝', True)\n",
"\n",
"with open('./in_the_name_of_people.txt') as f:\n",
" document = f.read()\n",
" \n",
" #document_decode = document.decode('GBK')\n",
" \n",
" document_cut = jieba.cut(document)\n",
" #print ' '.join(jieba_cut) //如果打印结果,则分词效果消失,后面的result无法显示\n",
" result = ' '.join(document_cut)\n",
" result = result.encode('utf-8')\n",
" with open('./in_the_name_of_people_segment.txt', 'w') as f2:\n",
" f2.write(result)\n",
"f.close()\n",
"f2.close()"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"# import modules & set up logging\n",
"import logging\n",
"import os\n",
"from gensim.models import word2vec\n",
"\n",
"logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n",
"\n",
"sentences = word2vec.LineSentence('./in_the_name_of_people_segment.txt') \n",
"\n",
"model = word2vec.Word2Vec(sentences, hs=1,min_count=1,window=3,size=100) "
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"祁同伟 0.977946519852\n",
"赵东来 0.969956457615\n",
"侯亮平 0.96396112442\n",
"沙瑞金 0.959288835526\n",
"易学习 0.95690870285\n"
]
}
],
"source": [
"req_count = 5\n",
"for key in model.wv.similar_by_word('李达康'.decode('utf-8'), topn =100):\n",
" if len(key[0])==3:\n",
" req_count -= 1\n",
" print key[0], key[1]\n",
" if req_count == 0:\n",
" break;"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"李达康 0.969956457615\n",
"陆亦可 0.969528734684\n",
"赵瑞龙 0.966745913029\n",
"祁同伟 0.965694904327\n",
"蔡成功 0.961919903755\n"
]
}
],
"source": [
"req_count = 5\n",
"for key in model.wv.similar_by_word('赵东来'.decode('utf-8'), topn =100):\n",
" if len(key[0])==3:\n",
" req_count -= 1\n",
" print key[0], key[1]\n",
" if req_count == 0:\n",
" break;"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"沙瑞金 0.961137533188\n",
"开玩笑 0.935248732567\n",
"祁同伟 0.931046843529\n",
"李达康 0.921290516853\n",
"打电话 0.916399776936\n"
]
}
],
"source": [
"req_count = 5\n",
"for key in model.wv.similar_by_word('高育良'.decode('utf-8'), topn =100):\n",
" if len(key[0])==3:\n",
" req_count -= 1\n",
" print key[0], key[1]\n",
" if req_count == 0:\n",
" break;"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"高育良 0.967257142067\n",
"李达康 0.959131598473\n",
"田国富 0.953414440155\n",
"易学习 0.943500876427\n",
"祁同伟 0.942932963371\n"
]
}
],
"source": [
"req_count = 5\n",
"for key in model.wv.similar_by_word('沙瑞金'.decode('utf-8'), topn =100):\n",
" if len(key[0])==3:\n",
" req_count -= 1\n",
" print key[0], key[1]\n",
" if req_count == 0:\n",
" break;"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.961137455325\n",
"0.935589365706\n"
]
}
],
"source": [
"print model.wv.similarity('沙瑞金'.decode('utf-8'), '高育良'.decode('utf-8'))\n",
"print model.wv.similarity('李达康'.decode('utf-8'), '王大路'.decode('utf-8'))"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"刘庆祝\n"
]
}
],
"source": [
"print model.wv.doesnt_match(u\"沙瑞金 高育良 李达康 刘庆祝\".split())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
" "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
5 changes: 5 additions & 0 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -192,5 +192,10 @@ http://www.cnblogs.com/pinard 刘建平Pinard
[条件随机场CRF(一)从随机场到线性链条件随机场](https://www.cnblogs.com/pinard/p/7048333.html)|无
[条件随机场CRF(二) 前向后向算法评估标记序列概率](https://www.cnblogs.com/pinard/p/7055072.html)|无
[条件随机场CRF(三) 模型学习与维特比算法解码](https://www.cnblogs.com/pinard/p/7068574.html)|无
[word2vec原理(一) CBOW与Skip-Gram模型基础])(https://www.cnblogs.com/pinard/p/7160330.html)|无
[word2vec原理(二) 基于Hierarchical Softmax的模型](https://www.cnblogs.com/pinard/p/7243513.html)|无
[word2vec原理(三) 基于Negative Sampling的模型](https://www.cnblogs.com/pinard/p/7249903.html)|(https://github.com/ljpzzz/machinelearning/blob/master/natural-language-processing/word2vec.ipynb)
[用gensim学习word2vec](https://www.cnblogs.com/pinard/p/7278324.html)|


License MIT.

0 comments on commit b855411

Please sign in to comment.