forked from ljpzzz/machinelearning
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
pinard.liu
committed
Sep 16, 2018
1 parent
3afff3c
commit b855411
Showing
2 changed files
with
266 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,261 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Copyright (C) 2016 - 2019 Pinard Liu([email protected])\n", | ||
"\n", | ||
"https://www.cnblogs.com/pinard\n", | ||
"\n", | ||
"Permission given to modify the code as long as you keep this declaration at the top\n", | ||
"\n", | ||
"用gensim学习word2vec https://www.cnblogs.com/pinard/p/7278324.html" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# -*- coding: utf-8 -*-\n", | ||
"\n", | ||
"import jieba\n", | ||
"import jieba.analyse\n", | ||
"\n", | ||
"jieba.suggest_freq('沙瑞金', True)\n", | ||
"jieba.suggest_freq('田国富', True)\n", | ||
"jieba.suggest_freq('高育良', True)\n", | ||
"jieba.suggest_freq('侯亮平', True)\n", | ||
"jieba.suggest_freq('钟小艾', True)\n", | ||
"jieba.suggest_freq('陈岩石', True)\n", | ||
"jieba.suggest_freq('欧阳菁', True)\n", | ||
"jieba.suggest_freq('易学习', True)\n", | ||
"jieba.suggest_freq('王大路', True)\n", | ||
"jieba.suggest_freq('蔡成功', True)\n", | ||
"jieba.suggest_freq('孙连城', True)\n", | ||
"jieba.suggest_freq('季昌明', True)\n", | ||
"jieba.suggest_freq('丁义珍', True)\n", | ||
"jieba.suggest_freq('郑西坡', True)\n", | ||
"jieba.suggest_freq('赵东来', True)\n", | ||
"jieba.suggest_freq('高小琴', True)\n", | ||
"jieba.suggest_freq('赵瑞龙', True)\n", | ||
"jieba.suggest_freq('林华华', True)\n", | ||
"jieba.suggest_freq('陆亦可', True)\n", | ||
"jieba.suggest_freq('刘新建', True)\n", | ||
"jieba.suggest_freq('刘庆祝', True)\n", | ||
"\n", | ||
"with open('./in_the_name_of_people.txt') as f:\n", | ||
" document = f.read()\n", | ||
" \n", | ||
" #document_decode = document.decode('GBK')\n", | ||
" \n", | ||
" document_cut = jieba.cut(document)\n", | ||
" #print ' '.join(jieba_cut) //如果打印结果,则分词效果消失,后面的result无法显示\n", | ||
" result = ' '.join(document_cut)\n", | ||
" result = result.encode('utf-8')\n", | ||
" with open('./in_the_name_of_people_segment.txt', 'w') as f2:\n", | ||
" f2.write(result)\n", | ||
"f.close()\n", | ||
"f2.close()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 64, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# import modules & set up logging\n", | ||
"import logging\n", | ||
"import os\n", | ||
"from gensim.models import word2vec\n", | ||
"\n", | ||
"logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n", | ||
"\n", | ||
"sentences = word2vec.LineSentence('./in_the_name_of_people_segment.txt') \n", | ||
"\n", | ||
"model = word2vec.Word2Vec(sentences, hs=1,min_count=1,window=3,size=100) " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 65, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"祁同伟 0.977946519852\n", | ||
"赵东来 0.969956457615\n", | ||
"侯亮平 0.96396112442\n", | ||
"沙瑞金 0.959288835526\n", | ||
"易学习 0.95690870285\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"req_count = 5\n", | ||
"for key in model.wv.similar_by_word('李达康'.decode('utf-8'), topn =100):\n", | ||
" if len(key[0])==3:\n", | ||
" req_count -= 1\n", | ||
" print key[0], key[1]\n", | ||
" if req_count == 0:\n", | ||
" break;" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 66, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"李达康 0.969956457615\n", | ||
"陆亦可 0.969528734684\n", | ||
"赵瑞龙 0.966745913029\n", | ||
"祁同伟 0.965694904327\n", | ||
"蔡成功 0.961919903755\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"req_count = 5\n", | ||
"for key in model.wv.similar_by_word('赵东来'.decode('utf-8'), topn =100):\n", | ||
" if len(key[0])==3:\n", | ||
" req_count -= 1\n", | ||
" print key[0], key[1]\n", | ||
" if req_count == 0:\n", | ||
" break;" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 67, | ||
"metadata": { | ||
"scrolled": true | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"沙瑞金 0.961137533188\n", | ||
"开玩笑 0.935248732567\n", | ||
"祁同伟 0.931046843529\n", | ||
"李达康 0.921290516853\n", | ||
"打电话 0.916399776936\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"req_count = 5\n", | ||
"for key in model.wv.similar_by_word('高育良'.decode('utf-8'), topn =100):\n", | ||
" if len(key[0])==3:\n", | ||
" req_count -= 1\n", | ||
" print key[0], key[1]\n", | ||
" if req_count == 0:\n", | ||
" break;" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 63, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"高育良 0.967257142067\n", | ||
"李达康 0.959131598473\n", | ||
"田国富 0.953414440155\n", | ||
"易学习 0.943500876427\n", | ||
"祁同伟 0.942932963371\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"req_count = 5\n", | ||
"for key in model.wv.similar_by_word('沙瑞金'.decode('utf-8'), topn =100):\n", | ||
" if len(key[0])==3:\n", | ||
" req_count -= 1\n", | ||
" print key[0], key[1]\n", | ||
" if req_count == 0:\n", | ||
" break;" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 77, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"0.961137455325\n", | ||
"0.935589365706\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print model.wv.similarity('沙瑞金'.decode('utf-8'), '高育良'.decode('utf-8'))\n", | ||
"print model.wv.similarity('李达康'.decode('utf-8'), '王大路'.decode('utf-8'))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 76, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"刘庆祝\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print model.wv.doesnt_match(u\"沙瑞金 高育良 李达康 刘庆祝\".split())" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
" " | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.6.4" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 1 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters