From f97f2bbc04139b9e0e222215018a4241ed78c969 Mon Sep 17 00:00:00 2001 From: Zhiling Zhang <1840962220@qq.com> Date: Thu, 8 Oct 2020 16:54:48 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=85=B3=E9=94=AE=E8=AF=8D?= =?UTF-8?q?=E6=8A=BD=E5=8F=96=E5=8A=9F=E8=83=BD=EF=BC=8C=E5=B9=B6=E6=8F=90?= =?UTF-8?q?=E4=BE=9Bbenchmark=20https://github.com/blmoistawinde/HarvestTe?= =?UTF-8?q?xt/issues/23=20=E5=BC=95=E5=85=A5=E5=8F=AF=E4=B8=8B=E8=BD=BD?= =?UTF-8?q?=E7=9A=84=E5=A4=96=E9=83=A8=E8=AF=8D=E5=85=B8=EF=BC=8C=E8=BE=85?= =?UTF-8?q?=E5=8A=A9=E6=96=B0=E8=AF=8D=E5=8F=91=E7=8E=B0=E6=8E=92=E9=99=A4?= =?UTF-8?q?=E6=97=A7=E8=AF=8D=20https://github.com/blmoistawinde/HarvestTe?= =?UTF-8?q?xt/issues/24?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 58 ++- examples/basics.py | 100 ++++- examples/kwd_benchmark/CSL.ipynb | 523 ++++++++++++++++++++++ harvesttext/algorithms/keyword.py | 36 ++ harvesttext/algorithms/word_discoverer.py | 4 +- harvesttext/download_utils.py | 135 ++++++ harvesttext/harvesttext.py | 3 +- harvesttext/parsing.py | 2 +- harvesttext/resources.py | 30 +- harvesttext/word_discover.py | 71 ++- 10 files changed, 928 insertions(+), 34 deletions(-) create mode 100644 examples/kwd_benchmark/CSL.ipynb create mode 100644 harvesttext/algorithms/keyword.py create mode 100644 harvesttext/download_utils.py diff --git a/README.md b/README.md index 7602b6d..2ed63df 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ Sow with little data seed, harvest much from a text field. ![GitHub stars](https://img.shields.io/github/stars/blmoistawinde/harvesttext?style=social) ![PyPI - Python Version](https://img.shields.io/badge/python-3.6+-blue.svg) ![GitHub](https://img.shields.io/github/license/mashape/apistatus.svg) -![Version](https://img.shields.io/badge/version-V0.7-red.svg) +![Version](https://img.shields.io/badge/version-V0.8-red.svg) ## 用途 HarvestText是一个专注无(弱)监督方法,能够整合领域知识(如类型,别名)对特定领域文本进行简单高效地处理和分析的库。适用于许多文本预处理和初步探索性分析任务,在小说分析,网络文本,专业文献等领域都有潜在应用价值。 @@ -478,6 +478,37 @@ Text summarization(避免重复) 武磊和郜林,谁是中国最好的前锋? 
``` + + +### 关键词抽取 + +目前提供包括`textrank`和HarvestText封装jieba并配置好参数和停用词的`jieba_tfidf`(默认)两种算法。 + +示例(完整见[example](./examples/basics.py)): + +```python3 +# text为林俊杰《关键词》歌词 +print("《关键词》里的关键词") +kwds = ht.extract_keywords(text, 5, method="jieba_tfidf") +print("jieba_tfidf", kwds) +kwds = ht.extract_keywords(text, 5, method="textrank") +print("textrank", kwds) +``` + +``` +《关键词》里的关键词 +jieba_tfidf ['自私', '慷慨', '落叶', '消逝', '故事'] +textrank ['自私', '落叶', '慷慨', '故事', '位置'] +``` + +[CSL.ipynb](./examples/kwd_benchmark/CSL.ipynb)提供了不同算法,以及本库的实现与[textrank4zh](https://github.com/letiantian/TextRank4ZH)的在[CSL数据集](https://github.com/CLUEbenchmark/CLUE#6-csl-%E8%AE%BA%E6%96%87%E5%85%B3%E9%94%AE%E8%AF%8D%E8%AF%86%E5%88%AB-keyword-recognition)上的比较。由于仅有一个数据集且数据集对于以上算法都很不友好,表现仅供参考。 + +| 算法 | P@5 | R@5 | F@5 | +| --- | --- | --- | --- | +| textrank4zh | 0.0836 | 0.1174 | 0.0977 | +| ht_textrank | 0.0955 | 0.1342 | 0.1116 | +| ht_jieba_tfidf | **0.1035** | **0.1453** | **0.1209** | + @@ -486,9 +517,11 @@ Text summarization(避免重复) 现在本库内集成了一些资源,方便使用和建立demo。 资源包括: -- 褒贬义词典 清华大学 李军 整理自http://nlp.csai.tsinghua.edu.cn/site2/index.php/13-sms -- 百度停用词词典 来自网络:https://wenku.baidu.com/view/98c46383e53a580216fcfed9.html -- 领域词典 来自清华THUNLP: http://thuocl.thunlp.org/ 全部类型`['IT', '动物', '医药', '历史人名', '地名', '成语', '法律', '财经', '食物']` +- `get_qh_sent_dict`: 褒贬义词典 清华大学 李军 整理自http://nlp.csai.tsinghua.edu.cn/site2/index.php/13-sms +- `get_baidu_stopwords`: 百度停用词词典 来自网络:https://wenku.baidu.com/view/98c46383e53a580216fcfed9.html +- `get_qh_typed_words`: 领域词典 来自清华THUNLP: http://thuocl.thunlp.org/ 全部类型`['IT', '动物', '医药', '历史人名', '地名', '成语', '法律', '财经', '食物']` +- `get_english_senti_lexicon`: 英语情感词典 +- `get_jieba_dict`: (需要下载)jieba词频词典 此外,还提供了一个特殊资源——《三国演义》,包括: @@ -590,6 +623,21 @@ min_aggregation = np.sqrt(length) / 15
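+
+如果自动参数不适合当前语料,也可以令`auto_param=False`并手动指定各项阈值(以下数值仅为示意,需要根据语料规模自行调整):
+
+```python3
+new_words_info = ht.word_discover(doc,               # doc为待分析文本
+                                  auto_param=False,
+                                  max_word_len=5,
+                                  min_freq=0.00005,
+                                  min_entropy=1.4,
+                                  min_aggregation=50,
+                                  sort_by='score')   # 按综合质量评分排序
+```
+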
+
使用结巴词典过滤旧词(展开查看)
+
+```python3
+from harvesttext.resources import get_jieba_dict
+jieba_dict = get_jieba_dict(min_freq=100)
+print("jieba词典中的词频>100的词语数:", len(jieba_dict))
+text = "1979-1998-2020的喜宝们 我现在记忆不太好,大概是拍戏时摔坏了~有什么笔记都要当下写下来。前几天翻看,找着了当时记下的话.我觉得喜宝既不娱乐也不启示,但这就是生活就是人生,10/16来看喜宝吧"
+new_words_info = ht.word_discover(text,
+                                  excluding_words=set(jieba_dict),  # 排除词典已有词语
+                                  exclude_number=True)              # 排除数字(默认True)
+new_words = new_words_info.index.tolist()
+print(new_words)  # ['喜宝']
+```
+
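+
+`get_jieba_dict`还支持`with_pos=True`参数,此时返回形如`{(词语, 词性): 词频}`的词典(以下仅为示意用法):
+
+```python3
+jieba_dict_pos = get_jieba_dict(min_freq=100, with_pos=True)
+print(list(jieba_dict_pos.items())[:3])   # [(('词语', '词性'), 词频), ...]
+```
+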
+
+ [根据反馈更新](https://github.com/blmoistawinde/HarvestText/issues/13#issue-551894838) 原本默认接受一个单独的字符串,现在也可以接受字符串列表输入,会自动进行拼接 [根据反馈更新](https://github.com/blmoistawinde/HarvestText/issues/14#issuecomment-576081430) 现在默认按照词频降序排序,也可以传入`sort_by='score'`参数,按照综合质量评分排序。 @@ -802,3 +850,5 @@ we imagine what we'll find, in another life. [EventTriplesExtraction](https://github.com/liuhuanyong/EventTriplesExtraction) +[textrank4ZH](https://github.com/letiantian/TextRank4ZH) + diff --git a/examples/basics.py b/examples/basics.py index ab538e0..cec6774 100644 --- a/examples/basics.py +++ b/examples/basics.py @@ -1,6 +1,7 @@ #coding=utf-8 import re from harvesttext import HarvestText + ht = HarvestText() def new_word_discover(): @@ -398,29 +399,80 @@ def test_english(): # for sent0 in sentences: # print(sent0, ht_eng.analyse_sent(sent0)) - +def jieba_dict_new_word(): + from harvesttext.resources import get_jieba_dict + jieba_dict = get_jieba_dict(min_freq=100) + print("jiaba词典中的词频>100的词语数:", len(jieba_dict)) + text = "1979-1998-2020的喜宝们 我现在记忆不太好,大概是拍戏时摔坏了~有什么笔记都要当下写下来。前几天翻看,找着了当时记下的话.我觉得喜宝既不娱乐也不启示,但这就是生活就是人生,10/16来看喜宝吧" + new_words_info = ht.word_discover(text, + excluding_words=set(jieba_dict), # 排除词典已有词语 + exclude_number=True) # 排除数字(默认True) + new_words = new_words_info.index.tolist() + print(new_words) # ['喜宝'] + +def extract_keywords(): + text = """ +好好爱自己 就有人会爱你 +这乐观的说词 +幸福的样子 我感觉好真实 +找不到形容词 +沉默在掩饰 快泛滥的激情 +只剩下语助词 +有一种踏实 当你口中喊我名字 +落叶的位置 谱出一首诗 +时间在消逝 我们的故事开始 +这是第一次 +让我见识爱情 可以慷慨又自私 +你是我的关键词 +我不太确定 爱最好的方式 +是动词或名词 +很想告诉你 最赤裸的感情 +却又忘词 +聚散总有时 而哭笑也有时 +我不怕潜台词 +有一种踏实 是你心中有我名字 +落叶的位置 谱出一首诗 +时间在消逝 我们的故事开始 +这是第一次 +让我见识爱情 可以慷慨又自私 +你是我的关键词 +你藏在歌词 代表的意思 +是专有名词 +落叶的位置 谱出一首诗 +我们的故事 才正要开始 +这是第一次 +爱一个人爱得 如此慷慨又自私 +你是我的关键 + """ + print("《关键词》里的关键词") + kwds = ht.extract_keywords(text, 5, method="jieba_tfidf") + print("jieba_tfidf", kwds) + kwds = ht.extract_keywords(text, 5, method="textrank") + print("textrank", kwds) if __name__ == "__main__": - test_english() - new_word_discover() - new_word_register() - entity_segmentation() - sentiment_dict() - sentiment_dict_default() - entity_search() - text_summarization() - entity_network() - save_load_clear() - load_resources() - linking_strategy() - find_with_rules() - load_resources() - using_typed_words() - build_word_ego_graph() - entity_error_check() - depend_parse() - named_entity_recognition() - el_keep_all() - filter_el_with_rule() - clean_text() - cut_paragraph() + # test_english() + # new_word_discover() + # new_word_register() + # entity_segmentation() + # sentiment_dict() + # sentiment_dict_default() + # entity_search() + # text_summarization() + # entity_network() + # save_load_clear() + # load_resources() + # linking_strategy() + # find_with_rules() + # load_resources() + # using_typed_words() + # build_word_ego_graph() + # entity_error_check() + # depend_parse() + # named_entity_recognition() + # el_keep_all() + # filter_el_with_rule() + # clean_text() + # cut_paragraph() + # jieba_dict_new_word() + extract_keywords() diff --git a/examples/kwd_benchmark/CSL.ipynb b/examples/kwd_benchmark/CSL.ipynb new file mode 100644 index 0000000..9c0bb0a --- /dev/null +++ b/examples/kwd_benchmark/CSL.ipynb @@ -0,0 +1,523 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 3 + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python_defaultSpec_1602139106579", + 
"display_name": "Python 3.6.9 64-bit ('py36': conda)" + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "source": [ + "# HarvestText中的关键词算法benchmark" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import pandas as pd\n", + "import numpy as np\n", + "import networkx as nx\n", + "from tqdm import tqdm\n", + "import jieba\n", + "from collections import defaultdict\n", + "from harvesttext import HarvestText" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "ht = HarvestText()" + ] + }, + { + "source": [ + "首先,选取的数据集是CLUE整理的CSL关键词预测数据集([下载地址](https://github.com/CLUEbenchmark/CLUE#6-csl-%E8%AE%BA%E6%96%87%E5%85%B3%E9%94%AE%E8%AF%8D%E8%AF%86%E5%88%AB-keyword-recognition))。需要先下载并放到本目录的`CSL关键词预测`文件夹下\n", + "\n", + "在上面先在开发集上做一些基本的分析及调参。" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "3000" + }, + "metadata": {}, + "execution_count": 9 + } + ], + "source": [ + "data_dev = []\n", + "with open('CSL关键词预测/dev.json', encoding='utf-8') as f:\n", + " for line in f:\n", + " tmp = json.loads(line)\n", + " data_dev.append((tmp['abst'], tmp['keyword']))\n", + "len(data_dev)" + ] + }, + { + "source": [ + "一些基础的数据探索性分析(EDA)\n", + "- 每个文档的关键词个数\n", + "- 关键词的长度分布\n", + "- 考察分词`seg`的情况和不分词`nseg`的情况,有多少比例的关键词被覆盖。这决定了依赖分词和不依赖分词的算法所能达到的理论recall上限。" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "all_keywords = 0\n", + "recalls = {'seg':0, 'nseg':0}\n", + "kwd_cnt = defaultdict(int)\n", + "kwd_len_cnt = defaultdict(int)\n", + "for abst, kwds in data_dev:\n", + " kwd_cnt[len(kwds)] += 1\n", + " words = set(jieba.lcut(abst))\n", + " all_keywords += len(kwds)\n", + " recalls['seg'] += len(set(kwds) & words)\n", + " recalls['nseg'] += sum(int(kwd in abst) for kwd in kwds)\n", + " for kwd in kwds:\n", + " kwd_len_cnt[len(kwd)] += 1\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "defaultdict(, {4: 1814, 3: 1128, 2: 58})\n" + } + ], + "source": [ + "print(kwd_cnt)" + ] + }, + { + "source": [ + "每篇文档的关键词数量在2-4之间" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "1 0.004277\n2 0.260134\n3 0.387970\n4 0.702864\n5 0.812756\n6 0.904239\n7 0.937151\n8 0.956489\n9 0.971551\n10 0.980104\n11 0.988100\n12 0.991633\n13 0.995258\n14 0.995816\n15 0.996281\n16 0.997583\n17 0.998791\n18 0.999256\n19 0.999442\n20 0.999907\n31 1.000000\ndtype: float64" + }, + "metadata": {}, + "execution_count": 14 + } + ], + "source": [ + "# 关键词长度的累积概率分布\n", + "pd.Series(kwd_len_cnt).sort_index().cumsum() / sum(kwd_len_cnt.values())" + ] + }, + { + "source": [ + "存在很长的关键词,以一个词而不是多词词组为单元的关键词算法无法处理这些情况,不过4个字以内也已经可以覆盖70%" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "{'seg': 0.3697471178876906, 'nseg': 0.7791000371885459}\n" + } + 
], + "source": [ + "for k in recalls:\n", + " recalls[k] /= all_keywords\n", + "print(recalls)" + ] + }, + { + "source": [ + "上述情况说明,依赖jieba分词的算法在这个数据集上最多只能达到36.97%的recall,而其他从原文直接中抽取方法(新词发现,序列标注等)有可能达到77.91%。\n", + "\n", + "下面的算法,因此在数值上不会有很好的表现,不过依旧可以为比较和调参提供一些参考。" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "source": [ + "给出一个关键词抽取的示例,包括`textrank`和HarvestText封装jieba并配置好参数和停用词的`jieba_tfidf`。" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "随机噪声雷达通常利用时域相关完成脉冲压缩从而进行目标检测.该文根据压缩感知理论提出一种适用于噪声雷达目标检测的新算法,它用低维投影测量和信号重建取代了传统的相关操作和压缩处理,将大量运算转移到后期处理.该算法以噪声雷达所检测的目标空间分布满足稀疏性为前提;利用发射信号形成卷积矩阵,然后通过随机抽取卷积矩阵的行构建测量矩阵;并采用迭代收缩阈值算法实现目标信号重建.该文对算法作了详细的理论推导,形成完整的实现框架.仿真实验验证了算法的有效性,并分析了对处理结果影响较大的因素.该算法能够有效地重建目标,具有良好的运算效率.与时域相关法相比,大幅度减小了目标检测误差,有效抑制了输出旁瓣,并保持了信号的相位特性.\n真实关键词:['目标', '相关', '矩阵']\njieba_tfidf 关键词(前5):['算法', '矩阵', '检测', '目标', '信号']\ntextrank 关键词(前5):['算法', '信号', '目标', '压缩', '矩阵']\n" + } + ], + "source": [ + "text, kwds = data_dev[10]\n", + "print(text)\n", + "print(\"真实关键词:\", kwds)\n", + "print(\"jieba_tfidf 关键词(前5):\", ht.extract_keywords(text, 5, method=\"jieba_tfidf\"))\n", + "print(\"textrank 关键词(前5):\", ht.extract_keywords(text, 5, method=\"textrank\"))" + ] + }, + { + "source": [ + "每篇文章取前5个作为预测值,我们可以得到precision@5, recall@5, F1@5来评估算法的效果" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_keywords = 0\n", + "pred_keywords = 0\n", + "recall_new_word = 0" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": "100%|██████████| 3000/3000 [00:29<00:00, 100.76it/s]\njieba Precison:{prec:.4f}, Recall:{recall:.4f}, F1:{f1:.4f}\n" + } + ], + "source": [ + "topK = 5\n", + "ref_keywords, pred_keywords = 0, 0\n", + "acc_count = 0\n", + "for text, kwds in tqdm(data_dev):\n", + " ref_keywords += len(kwds)\n", + " pred_keywords += topK\n", + " preds = ht.extract_keywords(text, topK, method=\"jieba_tfidf\")\n", + " acc_count += len(set(kwds) & set(preds))\n", + "prec = acc_count / pred_keywords\n", + "recall = acc_count / ref_keywords\n", + "f1 = 2*prec*recall/(prec+recall)\n", + "print(f\"jieba Precison:{prec:.4f}, Recall:{recall:.4f}, F1:{f1:.4f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "jieba Precison:0.1060, Recall:0.1478, F1:0.1235\n" + } + ], + "source": [ + "print(f\"jieba Precison:{prec:.4f}, Recall:{recall:.4f}, F1:{f1:.4f}\")" + ] + }, + { + "source": [ + "Textrank调参" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": "100%|██████████| 3000/3000 [00:45<00:00, 66.11it/s]\ntextrank[block: doc, window:2, weighted:False] Precison:0.0942, Recall:0.1314, F1:0.1097\n100%|██████████| 3000/3000 [00:46<00:00, 64.20it/s]\ntextrank[block: doc, window:2, weighted:True] Precison:0.0955, Recall:0.1332, F1:0.1113\n100%|██████████| 3000/3000 [00:41<00:00, 71.53it/s]\ntextrank[block: doc, window:3, weighted:False] Precison:0.0948, Recall:0.1322, F1:0.1104\n100%|██████████| 3000/3000 [00:41<00:00, 
65.70it/s]\ntextrank[block: doc, window:3, weighted:True] Precison:0.0945, Recall:0.1318, F1:0.1101\n100%|██████████| 3000/3000 [00:41<00:00, 72.11it/s]\ntextrank[block: doc, window:4, weighted:False] Precison:0.0944, Recall:0.1316, F1:0.1100\n100%|██████████| 3000/3000 [00:41<00:00, 71.65it/s]\ntextrank[block: doc, window:4, weighted:True] Precison:0.0939, Recall:0.1309, F1:0.1093\n100%|██████████| 3000/3000 [00:45<00:00, 66.37it/s]\ntextrank[block: sent, window:2, weighted:False] Precison:0.0931, Recall:0.1299, F1:0.1085\n100%|██████████| 3000/3000 [00:45<00:00, 65.93it/s]\ntextrank[block: sent, window:2, weighted:True] Precison:0.0945, Recall:0.1318, F1:0.1101\n100%|██████████| 3000/3000 [00:41<00:00, 53.28it/s]\ntextrank[block: sent, window:3, weighted:False] Precison:0.0936, Recall:0.1305, F1:0.1090\n100%|██████████| 3000/3000 [00:40<00:00, 73.21it/s]\ntextrank[block: sent, window:3, weighted:True] Precison:0.0929, Recall:0.1295, F1:0.1082\n100%|██████████| 3000/3000 [00:40<00:00, 73.50it/s]\ntextrank[block: sent, window:4, weighted:False] Precison:0.0931, Recall:0.1298, F1:0.1084\n100%|██████████| 3000/3000 [00:41<00:00, 72.45it/s]\ntextrank[block: sent, window:4, weighted:True] Precison:0.0925, Recall:0.1290, F1:0.1077\n" + } + ], + "source": [ + "from itertools import product\n", + "\n", + "topK = 5\n", + "block_types = [\"doc\", \"sent\"]\n", + "window_sizes = [2, 3, 4]\n", + "if_weighted = [False, True]\n", + "for block_type, window, weighted in product(block_types, window_sizes, if_weighted):\n", + " ref_keywords, pred_keywords = 0, 0\n", + " acc_count = 0\n", + " for text, kwds in tqdm(data_dev):\n", + " ref_keywords += len(kwds)\n", + " pred_keywords += topK\n", + " preds = ht.extract_keywords(text, topK, method=\"textrank\", block_type=block_type, window=window, weighted=weighted)\n", + " acc_count += len(set(kwds) & set(preds))\n", + " prec = acc_count / pred_keywords\n", + " recall = acc_count / ref_keywords\n", + " f1 = 2*prec*recall/(prec+recall)\n", + " print(f\"textrank[block: {block_type}, window:{window}, weighted:{weighted}] Precison:{prec:.4f}, Recall:{recall:.4f}, F1:{f1:.4f}\")" + ] + }, + { + "source": [ + "textrank的最佳参数是 block: doc, window:2, weighted:True\n", + "\n", + "precision和recall与jieba_tfidf还是有差距,可能是因为后者拥有从大量语料库中统计得到的idf数据能起到一定帮助" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "source": [ + "## 测试集benchmark\n", + "\n", + "选取各个算法的最佳参数在测试集上获得最终表现" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "3000" + }, + "metadata": {}, + "execution_count": 22 + } + ], + "source": [ + "data_test = []\n", + "with open('CSL关键词预测/test.json', encoding='utf-8') as f:\n", + " for line in f:\n", + " tmp = json.loads(line)\n", + " data_test.append((tmp['abst'], tmp['keyword']))\n", + "len(data_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": "100%|██████████| 3000/3000 [00:30<00:00, 99.11it/s]\njieba Precison:0.1035, Recall:0.1453, F1:0.1209\n" + } + ], + "source": [ + "topK = 5\n", + "ref_keywords, pred_keywords = 0, 0\n", + "acc_count = 0\n", + "for text, kwds in tqdm(data_test):\n", + " ref_keywords += len(kwds)\n", + " pred_keywords += topK\n", + " preds = ht.extract_keywords(text, topK, method=\"jieba_tfidf\")\n", + " acc_count += len(set(kwds) & set(preds))\n", + "prec = 
acc_count / pred_keywords\n", + "recall = acc_count / ref_keywords\n", + "f1 = 2*prec*recall/(prec+recall)\n", + "print(f\"jieba Precison:{prec:.4f}, Recall:{recall:.4f}, F1:{f1:.4f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": "100%|██████████| 3000/3000 [00:45<00:00, 65.51it/s]\ntextrank Precison:0.0955, Recall:0.1342, F1:0.1116\n" + } + ], + "source": [ + "topK = 5\n", + "ref_keywords, pred_keywords = 0, 0\n", + "acc_count = 0\n", + "for text, kwds in tqdm(data_test):\n", + " ref_keywords += len(kwds)\n", + " pred_keywords += topK\n", + " preds = ht.extract_keywords(text, topK, method=\"textrank\", block_size=\"doc\", window=2, weighted=True)\n", + " acc_count += len(set(kwds) & set(preds))\n", + "prec = acc_count / pred_keywords\n", + "recall = acc_count / ref_keywords\n", + "f1 = 2*prec*recall/(prec+recall)\n", + "print(f\"textrank Precison:{prec:.4f}, Recall:{recall:.4f}, F1:{f1:.4f}\")" + ] + }, + { + "source": [ + "另,附上HarvestText与另一个流行的textrank的实现,[textrank4zh](https://github.com/letiantian/TextRank4ZH)的比较" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "随机噪声雷达通常利用时域相关完成脉冲压缩从而进行目标检测.该文根据压缩感知理论提出一种适用于噪声雷达目标检测的新算法,它用低维投影测量和信号重建取代了传统的相关操作和压缩处理,将大量运算转移到后期处理.该算法以噪声雷达所检测的目标空间分布满足稀疏性为前提;利用发射信号形成卷积矩阵,然后通过随机抽取卷积矩阵的行构建测量矩阵;并采用迭代收缩阈值算法实现目标信号重建.该文对算法作了详细的理论推导,形成完整的实现框架.仿真实验验证了算法的有效性,并分析了对处理结果影响较大的因素.该算法能够有效地重建目标,具有良好的运算效率.与时域相关法相比,大幅度减小了目标检测误差,有效抑制了输出旁瓣,并保持了信号的相位特性.\n真实关键词:['目标', '相关', '矩阵']\ntextrank4zh 关键词(前5):['算法', '信号', '目标', '压缩', '运算']\n" + } + ], + "source": [ + "from textrank4zh import TextRank4Keyword\n", + "\n", + "def textrank4zh(text, topK, window=2):\n", + " # same as used in ht\n", + " allowPOS = {'n', 'ns', 'nr', 'nt', 'nz', 'vn', 'v', 'an', 'a', 'i'}\n", + " tr4w = TextRank4Keyword(allow_speech_tags=allowPOS)\n", + " tr4w.analyze(text=text, lower=True, window=window)\n", + " return [item.word for item in tr4w.get_keywords(topK)]\n", + "\n", + "text, kwds = data_dev[10]\n", + "print(text)\n", + "print(\"真实关键词:\", kwds)\n", + "print(\"textrank4zh 关键词(前5):\", textrank4zh(text, 5))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": "100%|██████████| 3000/3000 [02:12<00:00, 24.17it/s]\ntextrank4zh Precison:0.0836, Recall:0.1174, F1:0.0977\n" + } + ], + "source": [ + "topK = 5\n", + "ref_keywords, pred_keywords = 0, 0\n", + "acc_count = 0\n", + "for text, kwds in tqdm(data_test):\n", + " ref_keywords += len(kwds)\n", + " pred_keywords += topK\n", + " preds = textrank4zh(text, topK)\n", + " acc_count += len(set(kwds) & set(preds))\n", + "prec = acc_count / pred_keywords\n", + "recall = acc_count / ref_keywords\n", + "f1 = 2*prec*recall/(prec+recall)\n", + "print(f\"textrank4zh Precison:{prec:.4f}, Recall:{recall:.4f}, F1:{f1:.4f}\")" + ] + }, + { + "source": [ + "HarvestText的textrank的实现在精度和速度上都有一定的优势。" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "source": [ + "总结各个算法在CSL数据及上的结果:\n", + "\n", + "| 算法 | P@5 | R@5 | F@5 |\n", + "| --- | --- | --- | --- |\n", + "| textrank4zh | 0.0836 | 0.1174 | 0.0977 |\n", + "| ht_textrank | 0.0955 | 0.1342 | 0.1116 |\n", + "| ht_jieba_tfidf | **0.1035** | **0.1453** | **0.1209** |\n", + "\n", + "综上,HarvestText的关键词抽取功能\n", + 
"- 把配置好参数的jieba_tfidf作为默认方法\n", + "- 使用自己的textrank实现而不是用流行的textrank4zh。" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ] +} \ No newline at end of file diff --git a/harvesttext/algorithms/keyword.py b/harvesttext/algorithms/keyword.py new file mode 100644 index 0000000..0db95c3 --- /dev/null +++ b/harvesttext/algorithms/keyword.py @@ -0,0 +1,36 @@ +import numpy as np +import networkx as nx + +def combine(word_list, window = 2): + """构造在window下的单词组合,用来构造单词之间的边。 + + :params word_list: list of str, 由单词组成的列表。 + :params window: int, 窗口大小。 + """ + if window < 2: window = 2 + for x in range(1, window): + if x >= len(word_list): + break + word_list2 = word_list[x:] + res = zip(word_list, word_list2) + for r in res: + yield r + +def textrank(block_words, topK, with_score=False, window=2, weighted=False): + G = nx.Graph() + for word_list in block_words: + for u, v in combine(word_list, window): + if not weighted: + G.add_edge(u, v) + else: + if G.has_edge(u, v): + G[u][v]['weight'] += 1 + else: + G.add_edge(u, v, weight=1) + + pr = nx.pagerank_scipy(G) + pr_sorted = sorted(pr.items(), key=lambda x: x[1], reverse=True) + if with_score: + return pr_sorted[:topK] + else: + return [w for (w, imp) in pr_sorted[:topK]] \ No newline at end of file diff --git a/harvesttext/algorithms/word_discoverer.py b/harvesttext/algorithms/word_discoverer.py index 7283856..b0376cc 100644 --- a/harvesttext/algorithms/word_discoverer.py +++ b/harvesttext/algorithms/word_discoverer.py @@ -202,11 +202,13 @@ def genWords2(self, doc): v.left = entropyOfList(v.left) v.right = entropyOfList(v.right) return values - def get_df_info(self, ex_mentions): + def get_df_info(self, ex_mentions, exclude_number=True): info = {"text":[],"freq":[],"left_ent":[],"right_ent":[],"agg":[]} for w in self.word_infos: if w.text in ex_mentions: continue + if exclude_number and w.text.isdigit(): + continue info["text"].append(w.text) info["freq"].append(w.freq) info["left_ent"].append(w.left) diff --git a/harvesttext/download_utils.py b/harvesttext/download_utils.py new file mode 100644 index 0000000..e0360ac --- /dev/null +++ b/harvesttext/download_utils.py @@ -0,0 +1,135 @@ +import os +import shutil +import requests +import hashlib +from tqdm import tqdm +from collections import namedtuple +from os import environ, listdir, makedirs +from os.path import dirname, exists, expanduser, isdir, join, splitext + +RemoteFileMetadata = namedtuple('RemoteFileMetadata', + ['filename', 'url', 'checksum']) + +# config according to computer, this should be default setting of shadowsocks +DEFAULT_PROXIES = { + 'http': 'socks5h://127.0.0.1:1080', + 'https': 'socks5h://127.0.0.1:1080' +} + +def get_data_home(data_home=None): + """Return the path of the scikit-learn data dir. + This folder is used by some large dataset loaders to avoid downloading the + data several times. + By default the data dir is set to a folder named 'scikit_learn_data' in the + user home folder. + Alternatively, it can be set by the 'SCIKIT_LEARN_DATA' environment + variable or programmatically by giving an explicit folder path. The '~' + symbol is expanded to the user home folder. + If the folder does not already exist, it is automatically created. + Parameters + ---------- + data_home : str | None + The path to data dir. 
+ """ + if data_home is None: + data_home = environ.get('HARVESTTEXT_DATA', + join('~', '.harvesttext')) + data_home = expanduser(data_home) + if not exists(data_home): + makedirs(data_home) + return data_home + +def clear_data_home(data_home=None): + """Delete all the content of the data home cache. + Parameters + ---------- + data_home : str | None + The path to data dir. + """ + data_home = get_data_home(data_home) + shutil.rmtree(data_home) + +def _sha256(path): + """Calculate the sha256 hash of the file at path.""" + sha256hash = hashlib.sha256() + chunk_size = 8192 + with open(path, "rb") as f: + while True: + buffer = f.read(chunk_size) + if not buffer: + break + sha256hash.update(buffer) + return sha256hash.hexdigest() + +def _download_with_bar(url, file_path, proxies=DEFAULT_PROXIES): + # Streaming, so we can iterate over the response. + response = requests.get(url, stream=True, proxies=proxies) + total_size_in_bytes= int(response.headers.get('content-length', 0)) + block_size = 1024 # 1 KB + progress_bar = tqdm(total=total_size_in_bytes, unit='B', unit_scale=True) + with open(file_path, 'wb') as file: + for data in response.iter_content(block_size): + progress_bar.update(len(data)) + file.write(data) + progress_bar.close() + if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes: + raise Exception("ERROR, something went wrong with the downloading") + return file_path + +def _fetch_remote(remote, dirname=None, use_proxy=False, proxies=DEFAULT_PROXIES): + """Helper function to download a remote dataset into path + Fetch a dataset pointed by remote's url, save into path using remote's + filename and ensure its integrity based on the SHA256 Checksum of the + downloaded file. + Parameters + ---------- + remote : RemoteFileMetadata + Named tuple containing remote dataset meta information: url, filename + and checksum + dirname : string + Directory to save the file to. + Returns + ------- + file_path: string + Full path of the created file. 
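+    Notes
+    -----
+    When ``use_proxy`` is True, the request is routed through ``proxies``
+    (by default the local socks5 proxy in ``DEFAULT_PROXIES``); otherwise no
+    proxy is used.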
+ """ + + file_path = (remote.filename if dirname is None + else join(dirname, remote.filename)) + proxies = None if not use_proxy else proxies + file_path = _download_with_bar(remote.url, file_path, proxies) + checksum = _sha256(file_path) + if remote.checksum != checksum: + raise IOError("{} has an SHA256 checksum ({}) " + "differing from expected ({}), " + "file may be corrupted.".format(file_path, checksum, + remote.checksum)) + return file_path + + +def download(remote, file_path=None, use_proxy=False, proxies=DEFAULT_PROXIES): + data_home = get_data_home() + file_path = _fetch_remote(remote, data_home, use_proxy, proxies) + return file_path + +def check_download_resource(remote, use_proxy=False, proxies=None): + proxies = DEFAULT_PROXIES if use_proxy and proxies is None else proxies + data_home = get_data_home() + file_path = os.path.join(data_home, remote.filename) + if not os.path.exists(file_path): + # currently don't capture error at this level, assume download success + file_path = download(remote, data_home) + return file_path + +if __name__ == "__main__": + ARCHIVE = RemoteFileMetadata( + filename='harvesttext-0.7.2-py3-none-any.whl', + url='https://github.com/blmoistawinde/HarvestText/releases/download/V0.7.2/harvesttext-0.7.2-py3-none-any.whl', + checksum='004c8b0b1858f69025f721bc84cff33127d53c6ab526beed7a7a801a9c21f30b') + print("Download") + file_path = download(ARCHIVE) + print(file_path) + # if proxy is available + # print("Download using proxy") + # file_path = download(ARCHIVE, use_proxy=True) + # print(file_path) \ No newline at end of file diff --git a/harvesttext/harvesttext.py b/harvesttext/harvesttext.py index 5698414..2579969 100644 --- a/harvesttext/harvesttext.py +++ b/harvesttext/harvesttext.py @@ -53,6 +53,7 @@ def __init__(self, standard_name=False, language='zh_CN'): self.pinyin_adjlist = json.load(f) self.language = language if language == "en": + import nltk try: nltk.data.find('taggers/averaged_perceptron_tagger') except: @@ -774,7 +775,7 @@ def clean_text(self, text, remove_url=True, email=True, weibo_at=True, stop_term if t2s: cc = OpenCC('t2s') text = cc.convert(text) - assert hasattr(stop_terms, "__init__"), Exception("去除的词语必须是一个可迭代对象") + assert hasattr(stop_terms, "__iter__"), Exception("去除的词语必须是一个可迭代对象") if type(stop_terms) == str: text = text.replace(stop_terms, "") else: diff --git a/harvesttext/parsing.py b/harvesttext/parsing.py index 63c7fce..171be03 100644 --- a/harvesttext/parsing.py +++ b/harvesttext/parsing.py @@ -139,7 +139,7 @@ def cut_paragraphs(self, text, num_paras=None, block_sents=3, std_weight=0.5, if num_paras is not None: assert num_paras > 0, "Should give a positive number of num_paras" assert stopwords == 'baidu' or (hasattr(stopwords, '__iter__') and type(stopwords) != str) - stopwords = get_baidu_stopwords() if stopwords == 'baidu' else stopwords + stopwords = get_baidu_stopwords() if stopwords == 'baidu' else set(stopwords) if seq_chars < 1: cut_seqs = lambda x: self.cut_sentences(x, **kwargs) else: diff --git a/harvesttext/resources.py b/harvesttext/resources.py index 623329b..84ff3de 100644 --- a/harvesttext/resources.py +++ b/harvesttext/resources.py @@ -10,6 +10,7 @@ # 李军 中文评论的褒贬义分类实验研究 硕士论文 清华大学 2008 import os import json +from collections import defaultdict def get_qh_sent_dict(): """ @@ -123,4 +124,31 @@ def get_english_senti_lexicon(type="LH"): senti_lexicon = json.load(f) return senti_lexicon - +def get_jieba_dict(min_freq=0, max_freq=float('inf'), with_pos=False, use_proxy=False, proxies=None): + """ + 
获得jieba自带的中文词语词频词典 + + :params min_freq: 选取词语需要的最小词频 + :params max_freq: 选取词语允许的最大词频 + :params with_pos: 返回结果是否包括词性信息 + :return if not with_pos, dict of {wd: freq}, else, dict of {(wd, pos): freq} + """ + from .download_utils import RemoteFileMetadata, check_download_resource + remote = RemoteFileMetadata( + filename='jieba_dict.txt', + url='https://github.com/blmoistawinde/HarvestText/releases/download/V0.8/jieba_dict.txt', + checksum='7197c3211ddd98962b036cdf40324d1ea2bfaa12bd028e68faa70111a88e12a8') + file_path = check_download_resource(remote, use_proxy, proxies) + ret = defaultdict(int) + with open(file_path, "r", encoding="utf-8") as f: + for line in f: + if len(line.strip().split()) == 3: + wd, freq, pos = line.strip().split() + freq = int(freq) + if freq > min_freq and freq < max_freq: + if not with_pos: + ret[wd] = freq + else: + ret[(wd, pos)] = freq + return ret + \ No newline at end of file diff --git a/harvesttext/word_discover.py b/harvesttext/word_discover.py index b302ec4..087e57d 100644 --- a/harvesttext/word_discover.py +++ b/harvesttext/word_discover.py @@ -1,4 +1,7 @@ +import jieba +import jieba.analyse import logging +import networkx as nx import numpy as np import pandas as pd from collections import defaultdict @@ -6,6 +9,7 @@ from .resources import get_baidu_stopwords from .algorithms.word_discoverer import WordDiscoverer from .algorithms.entity_discoverer import NFLEntityDiscoverer, NERPEntityDiscover +from .algorithms.keyword import textrank class WordDiscoverMixin: """ @@ -18,7 +22,7 @@ class WordDiscoverMixin: def word_discover(self, doc, threshold_seeds=[], auto_param=True, excluding_types=[], excluding_words='baidu_stopwords', # 可以排除已经登录的某些种类的实体,或者某些指定词 max_word_len=5, min_freq=0.00005, min_entropy=1.4, min_aggregation=50, - ent_threshold="both", mem_saving=None, sort_by='freq'): + ent_threshold="both", mem_saving=None, sort_by='freq', exclude_number=True): '''新词发现,基于 http://www.matrix67.com/blog/archives/5044 实现及微调 :param doc: (string or list) 待进行新词发现的语料,如果是列表的话,就会自动用换行符拼接 @@ -33,6 +37,7 @@ def word_discover(self, doc, threshold_seeds=[], auto_param=True, :param ent_threshold: "both": (默认)在使用左右交叉熵进行筛选时,两侧都必须超过阈值; "avg": 两侧的平均值达到阈值即可 :param mem_saving: bool or None, 采用一些过滤手段来减少内存使用,但可能影响速度。如果不指定,对长文本自动打开,而对短文本不使用 :param sort_by: 以下string之一: {'freq': 词频, 'score': 综合分数, 'agg':凝聚度} 按照特定指标对得到的词语信息排序,默认使用词频 + :param exclude_number: (默认True)过滤发现的纯数字新词 :return: info: 包含新词作为index, 以及对应各项指标的DataFrame ''' if type(doc) != str: @@ -72,7 +77,7 @@ def word_discover(self, doc, threshold_seeds=[], auto_param=True, else: ex_mentions |= set(excluding_words) - info = ws.get_df_info(ex_mentions) + info = ws.get_df_info(ex_mentions, exclude_number) # 利用种子词来确定筛选优质新词的标准,种子词中最低质量的词语将被保留(如果一开始就被找到的话) if len(threshold_seeds) > 0: @@ -234,4 +239,66 @@ def entity_discover(self, text, return_count=False, method="NFL", min_count=5, p return entity_mention_dict, entity_type_dict, mention_count else: return entity_mention_dict, entity_type_dict + + def extract_keywords(self, text, topK, with_score=False, min_word_len=2, stopwords="baidu", allowPOS="default", method="jieba_tfidf", **kwargs): + """用各种算法抽取关键词(目前均为无监督),结合了ht的实体分词来提高准确率 + 目前支持的算法类型(及额外参数): + + - jieba_tfidf: (默认)jieba自带的基于tfidf的关键词抽取算法,idf统计信息来自于其语料库 + - textrank: 基于textrank的关键词抽取算法 + - block_type: 默认"doc"。 支持三种级别,"sent", "para", "doc",每个block之间的临近词语不建立连边 + - window: 默认2, 邻接的几个词语之内建立连边 + - weighted: 默认False, 时候使用加权图计算textrank + - 构建词图时会过滤不符合min_word_len, stopwords, allowPOS要求的词语 + + :params text: 从中挖掘关键词的文档 + :params topK: int, 
从每个文档中抽取的关键词(最大)数量
+        :params with_score: bool, 默认False, 是否同时返回算法提供的分数(如果有的话)
+        :params min_word_len: 默认2, 被纳入关键词的词语不低于此长度
+        :params stopwords: 字符串列表/元组/集合,或者'baidu'为默认百度停用词,在算法中引入的停用词,一般能够提升准确度
+        :params allowPOS: iterable of str,关键词应当属于的词性,默认为"default" {'n', 'ns', 'nr', 'nt', 'nz', 'vn', 'v', 'an', 'a', 'i'}以及已登录的实体词类型
+        :params method: 选择用于抽取的算法,目前支持"jieba_tfidf", "textrank"
+        :params kwargs: 其他算法专属参数
+
+        """
+        assert method in {"jieba_tfidf", "textrank"}, "目前不支持的算法: " + str(method)
+        if allowPOS == 'default':
+            # ref: 结巴分词标注兼容_ICTCLAS2008汉语词性标注集 https://www.cnblogs.com/hpuCode/p/4416186.html
+            allowPOS = {'n', 'ns', 'nr', 'nt', 'nz', 'vn', 'v', 'an', 'a', 'i'}
+        else:
+            assert hasattr(allowPOS, "__iter__")
+        # for HT, we consider registered entity types specifically
+        allowPOS = set(allowPOS) | set(self.type_entity_mention_dict)
+
+        assert stopwords == 'baidu' or (hasattr(stopwords, '__iter__') and type(stopwords) != str)
+        stopwords = get_baidu_stopwords() if stopwords == 'baidu' else set(stopwords)
+
+        if method == "jieba_tfidf":
+            kwds = jieba.analyse.extract_tags(text, topK=int(2*topK), allowPOS=allowPOS, withWeight=with_score)
+            if with_score:
+                kwds = [(kwd, score) for (kwd, score) in kwds if kwd not in stopwords][:topK]
+            else:
+                kwds = kwds[:topK]
+        elif method == "textrank":
+            block_type = kwargs.get("block_type", "doc")
+            assert block_type in {"sent", "para", "doc"}
+            window = kwargs.get("window", 2)
+            weighted = kwargs.get("weighted", True)
+            if block_type == "doc":
+                blocks = [text]
+            elif block_type == "para":
+                blocks = [para.strip() for para in text.split("\n") if para.strip() != ""]
+            elif block_type == "sent":
+                blocks = self.cut_sentences(text)
+            block_pos = (self.posseg(block.strip(), stopwords=stopwords) for block in blocks)
+            block_words = [[wd for wd, pos in x
+                            if pos in allowPOS and len(wd) >= min_word_len]
+                           for x in block_pos]
+            kwds = textrank(block_words, topK, with_score, window, weighted)
+
+        return kwds
\ No newline at end of file
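
补充示例(非补丁原文,仅作示意):`harvesttext/algorithms/keyword.py` 中新增的 `textrank` 也可以脱离 `extract_keywords` 单独调用,输入为每个 block(句/段/篇)分词并过滤停用词后的词语列表:

```python3
from harvesttext.algorithms.keyword import textrank

# 每个子列表对应一个block分词后的结果
block_words = [
    ["雷达", "目标", "检测", "算法"],
    ["算法", "信号", "重建", "目标"],
]
# weighted=True 时,窗口内的共现次数会作为边权参与PageRank计算
print(textrank(block_words, topK=3, window=2, weighted=True))
```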