add 201707-201708 blog and code

MoodMAX · Sep 16, 2018 · b855411 · b855411
1 parent 3afff3c
commit b855411
Show file tree

Hide file tree

Showing 2 changed files with 266 additions and 0 deletions.
diff --git a/natural-language-processing/word2vec.ipynb b/natural-language-processing/word2vec.ipynb
@@ -0,0 +1,261 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Copyright (C) 2016 - 2019 Pinard Liu([email protected])\n",
+    "\n",
+    "https://www.cnblogs.com/pinard\n",
+    "\n",
+    "Permission given to modify the code as long as you keep this declaration at the top\n",
+    "\n",
+    "用gensim学习word2vec https://www.cnblogs.com/pinard/p/7278324.html"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# -*- coding: utf-8 -*-\n",
+    "\n",
+    "import jieba\n",
+    "import jieba.analyse\n",
+    "\n",
+    "jieba.suggest_freq('沙瑞金', True)\n",
+    "jieba.suggest_freq('田国富', True)\n",
+    "jieba.suggest_freq('高育良', True)\n",
+    "jieba.suggest_freq('侯亮平', True)\n",
+    "jieba.suggest_freq('钟小艾', True)\n",
+    "jieba.suggest_freq('陈岩石', True)\n",
+    "jieba.suggest_freq('欧阳菁', True)\n",
+    "jieba.suggest_freq('易学习', True)\n",
+    "jieba.suggest_freq('王大路', True)\n",
+    "jieba.suggest_freq('蔡成功', True)\n",
+    "jieba.suggest_freq('孙连城', True)\n",
+    "jieba.suggest_freq('季昌明', True)\n",
+    "jieba.suggest_freq('丁义珍', True)\n",
+    "jieba.suggest_freq('郑西坡', True)\n",
+    "jieba.suggest_freq('赵东来', True)\n",
+    "jieba.suggest_freq('高小琴', True)\n",
+    "jieba.suggest_freq('赵瑞龙', True)\n",
+    "jieba.suggest_freq('林华华', True)\n",
+    "jieba.suggest_freq('陆亦可', True)\n",
+    "jieba.suggest_freq('刘新建', True)\n",
+    "jieba.suggest_freq('刘庆祝', True)\n",
+    "\n",
+    "with open('./in_the_name_of_people.txt') as f:\n",
+    "    document = f.read()\n",
+    "    \n",
+    "    #document_decode = document.decode('GBK')\n",
+    "    \n",
+    "    document_cut = jieba.cut(document)\n",
+    "    #print  ' '.join(jieba_cut)  //如果打印结果，则分词效果消失，后面的result无法显示\n",
+    "    result = ' '.join(document_cut)\n",
+    "    result = result.encode('utf-8')\n",
+    "    with open('./in_the_name_of_people_segment.txt', 'w') as f2:\n",
+    "        f2.write(result)\n",
+    "f.close()\n",
+    "f2.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import modules & set up logging\n",
+    "import logging\n",
+    "import os\n",
+    "from gensim.models import word2vec\n",
+    "\n",
+    "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n",
+    "\n",
+    "sentences = word2vec.LineSentence('./in_the_name_of_people_segment.txt') \n",
+    "\n",
+    "model = word2vec.Word2Vec(sentences, hs=1,min_count=1,window=3,size=100)  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "祁同伟 0.977946519852\n",
+      "赵东来 0.969956457615\n",
+      "侯亮平 0.96396112442\n",
+      "沙瑞金 0.959288835526\n",
+      "易学习 0.95690870285\n"
+     ]
+    }
+   ],
+   "source": [
+    "req_count = 5\n",
+    "for key in model.wv.similar_by_word('李达康'.decode('utf-8'), topn =100):\n",
+    "    if len(key[0])==3:\n",
+    "        req_count -= 1\n",
+    "        print key[0], key[1]\n",
+    "        if req_count == 0:\n",
+    "            break;"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "李达康 0.969956457615\n",
+      "陆亦可 0.969528734684\n",
+      "赵瑞龙 0.966745913029\n",
+      "祁同伟 0.965694904327\n",
+      "蔡成功 0.961919903755\n"
+     ]
+    }
+   ],
+   "source": [
+    "req_count = 5\n",
+    "for key in model.wv.similar_by_word('赵东来'.decode('utf-8'), topn =100):\n",
+    "    if len(key[0])==3:\n",
+    "        req_count -= 1\n",
+    "        print key[0], key[1]\n",
+    "        if req_count == 0:\n",
+    "            break;"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "沙瑞金 0.961137533188\n",
+      "开玩笑 0.935248732567\n",
+      "祁同伟 0.931046843529\n",
+      "李达康 0.921290516853\n",
+      "打电话 0.916399776936\n"
+     ]
+    }
+   ],
+   "source": [
+    "req_count = 5\n",
+    "for key in model.wv.similar_by_word('高育良'.decode('utf-8'), topn =100):\n",
+    "    if len(key[0])==3:\n",
+    "        req_count -= 1\n",
+    "        print key[0], key[1]\n",
+    "        if req_count == 0:\n",
+    "            break;"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "高育良 0.967257142067\n",
+      "李达康 0.959131598473\n",
+      "田国富 0.953414440155\n",
+      "易学习 0.943500876427\n",
+      "祁同伟 0.942932963371\n"
+     ]
+    }
+   ],
+   "source": [
+    "req_count = 5\n",
+    "for key in model.wv.similar_by_word('沙瑞金'.decode('utf-8'), topn =100):\n",
+    "    if len(key[0])==3:\n",
+    "        req_count -= 1\n",
+    "        print key[0], key[1]\n",
+    "        if req_count == 0:\n",
+    "            break;"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 77,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.961137455325\n",
+      "0.935589365706\n"
+     ]
+    }
+   ],
+   "source": [
+    "print model.wv.similarity('沙瑞金'.decode('utf-8'), '高育良'.decode('utf-8'))\n",
+    "print model.wv.similarity('李达康'.decode('utf-8'), '王大路'.decode('utf-8'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 76,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "刘庆祝\n"
+     ]
+    }
+   ],
+   "source": [
+    "print model.wv.doesnt_match(u\"沙瑞金 高育良 李达康 刘庆祝\".split())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    " "
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/readme.md b/readme.md
@@ -192,5 +192,10 @@ http://www.cnblogs.com/pinard 刘建平Pinard
 [条件随机场CRF(一)从随机场到线性链条件随机场](https://www.cnblogs.com/pinard/p/7048333.html)|无
 [条件随机场CRF(二) 前向后向算法评估标记序列概率](https://www.cnblogs.com/pinard/p/7055072.html)|无
 [条件随机场CRF(三) 模型学习与维特比算法解码](https://www.cnblogs.com/pinard/p/7068574.html)|无
+[word2vec原理(一) CBOW与Skip-Gram模型基础])(https://www.cnblogs.com/pinard/p/7160330.html)|无
+[word2vec原理(二) 基于Hierarchical Softmax的模型](https://www.cnblogs.com/pinard/p/7243513.html)|无
+[word2vec原理(三) 基于Negative Sampling的模型](https://www.cnblogs.com/pinard/p/7249903.html)|(https://github.com/ljpzzz/machinelearning/blob/master/natural-language-processing/word2vec.ipynb)
+[用gensim学习word2vec](https://www.cnblogs.com/pinard/p/7278324.html)|
+
 
 License MIT.