diff --git a/README.md b/README.md index 7602b6d..2ed63df 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ Sow with little data seed, harvest much from a text field. ![GitHub stars](https://img.shields.io/github/stars/blmoistawinde/harvesttext?style=social) ![PyPI - Python Version](https://img.shields.io/badge/python-3.6+-blue.svg) ![GitHub](https://img.shields.io/github/license/mashape/apistatus.svg) -![Version](https://img.shields.io/badge/version-V0.7-red.svg) +![Version](https://img.shields.io/badge/version-V0.8-red.svg) ## 用途 HarvestText是一个专注无(弱)监督方法,能够整合领域知识(如类型,别名)对特定领域文本进行简单高效地处理和分析的库。适用于许多文本预处理和初步探索性分析任务,在小说分析,网络文本,专业文献等领域都有潜在应用价值。 @@ -478,6 +478,37 @@ Text summarization(避免重复) 武磊和郜林,谁是中国最好的前锋? ``` + + +### 关键词抽取 + +目前提供包括`textrank`和HarvestText封装jieba并配置好参数和停用词的`jieba_tfidf`(默认)两种算法。 + +示例(完整见[example](./examples/basics.py)): + +```python3 +# text为林俊杰《关键词》歌词 +print("《关键词》里的关键词") +kwds = ht.extract_keywords(text, 5, method="jieba_tfidf") +print("jieba_tfidf", kwds) +kwds = ht.extract_keywords(text, 5, method="textrank") +print("textrank", kwds) +``` + +``` +《关键词》里的关键词 +jieba_tfidf ['自私', '慷慨', '落叶', '消逝', '故事'] +textrank ['自私', '落叶', '慷慨', '故事', '位置'] +``` + +[CSL.ipynb](./examples/kwd_benchmark/CSL.ipynb)提供了不同算法,以及本库的实现与[textrank4zh](https://github.com/letiantian/TextRank4ZH)的在[CSL数据集](https://github.com/CLUEbenchmark/CLUE#6-csl-%E8%AE%BA%E6%96%87%E5%85%B3%E9%94%AE%E8%AF%8D%E8%AF%86%E5%88%AB-keyword-recognition)上的比较。由于仅有一个数据集且数据集对于以上算法都很不友好,表现仅供参考。 + +| 算法 | P@5 | R@5 | F@5 | +| --- | --- | --- | --- | +| textrank4zh | 0.0836 | 0.1174 | 0.0977 | +| ht_textrank | 0.0955 | 0.1342 | 0.1116 | +| ht_jieba_tfidf | **0.1035** | **0.1453** | **0.1209** | + @@ -486,9 +517,11 @@ Text summarization(避免重复) 现在本库内集成了一些资源,方便使用和建立demo。 资源包括: -- 褒贬义词典 清华大学 李军 整理自http://nlp.csai.tsinghua.edu.cn/site2/index.php/13-sms -- 百度停用词词典 来自网络:https://wenku.baidu.com/view/98c46383e53a580216fcfed9.html -- 领域词典 来自清华THUNLP: http://thuocl.thunlp.org/ 全部类型`['IT', '动物', '医药', '历史人名', '地名', '成语', '法律', '财经', '食物']` +- `get_qh_sent_dict`: 褒贬义词典 清华大学 李军 整理自http://nlp.csai.tsinghua.edu.cn/site2/index.php/13-sms +- `get_baidu_stopwords`: 百度停用词词典 来自网络:https://wenku.baidu.com/view/98c46383e53a580216fcfed9.html +- `get_qh_typed_words`: 领域词典 来自清华THUNLP: http://thuocl.thunlp.org/ 全部类型`['IT', '动物', '医药', '历史人名', '地名', '成语', '法律', '财经', '食物']` +- `get_english_senti_lexicon`: 英语情感词典 +- `get_jieba_dict`: (需要下载)jieba词频词典 此外,还提供了一个特殊资源——《三国演义》,包括: @@ -590,6 +623,21 @@ min_aggregation = np.sqrt(length) / 15
+
+<details><summary>使用结巴词典过滤旧词(展开查看)</summary>
+
+```
+from harvesttext.resources import get_jieba_dict
+jieba_dict = get_jieba_dict(min_freq=100)
+print("jieba词典中的词频>100的词语数:", len(jieba_dict))
+text = "1979-1998-2020的喜宝们 我现在记忆不太好,大概是拍戏时摔坏了~有什么笔记都要当下写下来。前几天翻看,找着了当时记下的话.我觉得喜宝既不娱乐也不启示,但这就是生活就是人生,10/16来看喜宝吧"
+new_words_info = ht.word_discover(text,
+                                  excluding_words=set(jieba_dict),  # 排除词典已有词语
+                                  exclude_number=True)              # 排除数字(默认True)
+new_words = new_words_info.index.tolist()
+print(new_words)  # ['喜宝']
+```
+
+</details>
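+
+Besides `min_freq`, `get_jieba_dict` (added in `harvesttext/resources.py`) also accepts `max_freq` and `with_pos`. A minimal sketch of these options, assuming the word-frequency file has already been downloaded to the default data directory:
+
+```python3
+from harvesttext.resources import get_jieba_dict
+# keep only mid-frequency entries; with_pos=True keys the dict by (word, pos) instead of word
+jieba_dict_pos = get_jieba_dict(min_freq=100, max_freq=100000, with_pos=True)
+print(len(jieba_dict_pos))
+print(list(jieba_dict_pos.items())[:3])  # actual contents depend on the downloaded dictionary
+```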
+
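+`ht.word_discover` also accepts a list of strings (joined with newlines internally) and a `sort_by='score'` option, as noted in the feedback items below. A minimal sketch, with `ht` assumed to be the `HarvestText()` instance from the earlier examples:
+
+```python3
+docs = [
+    "武磊威武,中超第一射手!",
+    "武磊是中国最好的前锋之一",
+]
+new_words_info = ht.word_discover(docs, exclude_number=True,
+                                  sort_by='score')  # rank candidates by the combined quality score
+print(new_words_info.head())
+```
+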
+ [根据反馈更新](https://github.com/blmoistawinde/HarvestText/issues/13#issue-551894838) 原本默认接受一个单独的字符串,现在也可以接受字符串列表输入,会自动进行拼接 [根据反馈更新](https://github.com/blmoistawinde/HarvestText/issues/14#issuecomment-576081430) 现在默认按照词频降序排序,也可以传入`sort_by='score'`参数,按照综合质量评分排序。 @@ -802,3 +850,5 @@ we imagine what we'll find, in another life. [EventTriplesExtraction](https://github.com/liuhuanyong/EventTriplesExtraction) +[textrank4ZH](https://github.com/letiantian/TextRank4ZH) + diff --git a/examples/basics.py b/examples/basics.py index ab538e0..cec6774 100644 --- a/examples/basics.py +++ b/examples/basics.py @@ -1,6 +1,7 @@ #coding=utf-8 import re from harvesttext import HarvestText + ht = HarvestText() def new_word_discover(): @@ -398,29 +399,80 @@ def test_english(): # for sent0 in sentences: # print(sent0, ht_eng.analyse_sent(sent0)) - +def jieba_dict_new_word(): + from harvesttext.resources import get_jieba_dict + jieba_dict = get_jieba_dict(min_freq=100) + print("jiaba词典中的词频>100的词语数:", len(jieba_dict)) + text = "1979-1998-2020的喜宝们 我现在记忆不太好,大概是拍戏时摔坏了~有什么笔记都要当下写下来。前几天翻看,找着了当时记下的话.我觉得喜宝既不娱乐也不启示,但这就是生活就是人生,10/16来看喜宝吧" + new_words_info = ht.word_discover(text, + excluding_words=set(jieba_dict), # 排除词典已有词语 + exclude_number=True) # 排除数字(默认True) + new_words = new_words_info.index.tolist() + print(new_words) # ['喜宝'] + +def extract_keywords(): + text = """ +好好爱自己 就有人会爱你 +这乐观的说词 +幸福的样子 我感觉好真实 +找不到形容词 +沉默在掩饰 快泛滥的激情 +只剩下语助词 +有一种踏实 当你口中喊我名字 +落叶的位置 谱出一首诗 +时间在消逝 我们的故事开始 +这是第一次 +让我见识爱情 可以慷慨又自私 +你是我的关键词 +我不太确定 爱最好的方式 +是动词或名词 +很想告诉你 最赤裸的感情 +却又忘词 +聚散总有时 而哭笑也有时 +我不怕潜台词 +有一种踏实 是你心中有我名字 +落叶的位置 谱出一首诗 +时间在消逝 我们的故事开始 +这是第一次 +让我见识爱情 可以慷慨又自私 +你是我的关键词 +你藏在歌词 代表的意思 +是专有名词 +落叶的位置 谱出一首诗 +我们的故事 才正要开始 +这是第一次 +爱一个人爱得 如此慷慨又自私 +你是我的关键 + """ + print("《关键词》里的关键词") + kwds = ht.extract_keywords(text, 5, method="jieba_tfidf") + print("jieba_tfidf", kwds) + kwds = ht.extract_keywords(text, 5, method="textrank") + print("textrank", kwds) if __name__ == "__main__": - test_english() - new_word_discover() - new_word_register() - entity_segmentation() - sentiment_dict() - sentiment_dict_default() - entity_search() - text_summarization() - entity_network() - save_load_clear() - load_resources() - linking_strategy() - find_with_rules() - load_resources() - using_typed_words() - build_word_ego_graph() - entity_error_check() - depend_parse() - named_entity_recognition() - el_keep_all() - filter_el_with_rule() - clean_text() - cut_paragraph() + # test_english() + # new_word_discover() + # new_word_register() + # entity_segmentation() + # sentiment_dict() + # sentiment_dict_default() + # entity_search() + # text_summarization() + # entity_network() + # save_load_clear() + # load_resources() + # linking_strategy() + # find_with_rules() + # load_resources() + # using_typed_words() + # build_word_ego_graph() + # entity_error_check() + # depend_parse() + # named_entity_recognition() + # el_keep_all() + # filter_el_with_rule() + # clean_text() + # cut_paragraph() + # jieba_dict_new_word() + extract_keywords() diff --git a/examples/kwd_benchmark/CSL.ipynb b/examples/kwd_benchmark/CSL.ipynb new file mode 100644 index 0000000..9c0bb0a --- /dev/null +++ b/examples/kwd_benchmark/CSL.ipynb @@ -0,0 +1,523 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 3 + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python_defaultSpec_1602139106579", + 
"display_name": "Python 3.6.9 64-bit ('py36': conda)" + } + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "source": [ + "# HarvestText中的关键词算法benchmark" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import pandas as pd\n", + "import numpy as np\n", + "import networkx as nx\n", + "from tqdm import tqdm\n", + "import jieba\n", + "from collections import defaultdict\n", + "from harvesttext import HarvestText" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "ht = HarvestText()" + ] + }, + { + "source": [ + "首先,选取的数据集是CLUE整理的CSL关键词预测数据集([下载地址](https://github.com/CLUEbenchmark/CLUE#6-csl-%E8%AE%BA%E6%96%87%E5%85%B3%E9%94%AE%E8%AF%8D%E8%AF%86%E5%88%AB-keyword-recognition))。需要先下载并放到本目录的`CSL关键词预测`文件夹下\n", + "\n", + "在上面先在开发集上做一些基本的分析及调参。" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "3000" + }, + "metadata": {}, + "execution_count": 9 + } + ], + "source": [ + "data_dev = []\n", + "with open('CSL关键词预测/dev.json', encoding='utf-8') as f:\n", + " for line in f:\n", + " tmp = json.loads(line)\n", + " data_dev.append((tmp['abst'], tmp['keyword']))\n", + "len(data_dev)" + ] + }, + { + "source": [ + "一些基础的数据探索性分析(EDA)\n", + "- 每个文档的关键词个数\n", + "- 关键词的长度分布\n", + "- 考察分词`seg`的情况和不分词`nseg`的情况,有多少比例的关键词被覆盖。这决定了依赖分词和不依赖分词的算法所能达到的理论recall上限。" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "all_keywords = 0\n", + "recalls = {'seg':0, 'nseg':0}\n", + "kwd_cnt = defaultdict(int)\n", + "kwd_len_cnt = defaultdict(int)\n", + "for abst, kwds in data_dev:\n", + " kwd_cnt[len(kwds)] += 1\n", + " words = set(jieba.lcut(abst))\n", + " all_keywords += len(kwds)\n", + " recalls['seg'] += len(set(kwds) & words)\n", + " recalls['nseg'] += sum(int(kwd in abst) for kwd in kwds)\n", + " for kwd in kwds:\n", + " kwd_len_cnt[len(kwd)] += 1\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "defaultdict(, {4: 1814, 3: 1128, 2: 58})\n" + } + ], + "source": [ + "print(kwd_cnt)" + ] + }, + { + "source": [ + "每篇文档的关键词数量在2-4之间" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "1 0.004277\n2 0.260134\n3 0.387970\n4 0.702864\n5 0.812756\n6 0.904239\n7 0.937151\n8 0.956489\n9 0.971551\n10 0.980104\n11 0.988100\n12 0.991633\n13 0.995258\n14 0.995816\n15 0.996281\n16 0.997583\n17 0.998791\n18 0.999256\n19 0.999442\n20 0.999907\n31 1.000000\ndtype: float64" + }, + "metadata": {}, + "execution_count": 14 + } + ], + "source": [ + "# 关键词长度的累积概率分布\n", + "pd.Series(kwd_len_cnt).sort_index().cumsum() / sum(kwd_len_cnt.values())" + ] + }, + { + "source": [ + "存在很长的关键词,以一个词而不是多词词组为单元的关键词算法无法处理这些情况,不过4个字以内也已经可以覆盖70%" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "{'seg': 0.3697471178876906, 'nseg': 0.7791000371885459}\n" + } + 
], + "source": [ + "for k in recalls:\n", + " recalls[k] /= all_keywords\n", + "print(recalls)" + ] + }, + { + "source": [ + "上述情况说明,依赖jieba分词的算法在这个数据集上最多只能达到36.97%的recall,而其他从原文直接中抽取方法(新词发现,序列标注等)有可能达到77.91%。\n", + "\n", + "下面的算法,因此在数值上不会有很好的表现,不过依旧可以为比较和调参提供一些参考。" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "source": [ + "给出一个关键词抽取的示例,包括`textrank`和HarvestText封装jieba并配置好参数和停用词的`jieba_tfidf`。" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "随机噪声雷达通常利用时域相关完成脉冲压缩从而进行目标检测.该文根据压缩感知理论提出一种适用于噪声雷达目标检测的新算法,它用低维投影测量和信号重建取代了传统的相关操作和压缩处理,将大量运算转移到后期处理.该算法以噪声雷达所检测的目标空间分布满足稀疏性为前提;利用发射信号形成卷积矩阵,然后通过随机抽取卷积矩阵的行构建测量矩阵;并采用迭代收缩阈值算法实现目标信号重建.该文对算法作了详细的理论推导,形成完整的实现框架.仿真实验验证了算法的有效性,并分析了对处理结果影响较大的因素.该算法能够有效地重建目标,具有良好的运算效率.与时域相关法相比,大幅度减小了目标检测误差,有效抑制了输出旁瓣,并保持了信号的相位特性.\n真实关键词:['目标', '相关', '矩阵']\njieba_tfidf 关键词(前5):['算法', '矩阵', '检测', '目标', '信号']\ntextrank 关键词(前5):['算法', '信号', '目标', '压缩', '矩阵']\n" + } + ], + "source": [ + "text, kwds = data_dev[10]\n", + "print(text)\n", + "print(\"真实关键词:\", kwds)\n", + "print(\"jieba_tfidf 关键词(前5):\", ht.extract_keywords(text, 5, method=\"jieba_tfidf\"))\n", + "print(\"textrank 关键词(前5):\", ht.extract_keywords(text, 5, method=\"textrank\"))" + ] + }, + { + "source": [ + "每篇文章取前5个作为预测值,我们可以得到precision@5, recall@5, F1@5来评估算法的效果" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_keywords = 0\n", + "pred_keywords = 0\n", + "recall_new_word = 0" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": "100%|██████████| 3000/3000 [00:29<00:00, 100.76it/s]\njieba Precison:{prec:.4f}, Recall:{recall:.4f}, F1:{f1:.4f}\n" + } + ], + "source": [ + "topK = 5\n", + "ref_keywords, pred_keywords = 0, 0\n", + "acc_count = 0\n", + "for text, kwds in tqdm(data_dev):\n", + " ref_keywords += len(kwds)\n", + " pred_keywords += topK\n", + " preds = ht.extract_keywords(text, topK, method=\"jieba_tfidf\")\n", + " acc_count += len(set(kwds) & set(preds))\n", + "prec = acc_count / pred_keywords\n", + "recall = acc_count / ref_keywords\n", + "f1 = 2*prec*recall/(prec+recall)\n", + "print(f\"jieba Precison:{prec:.4f}, Recall:{recall:.4f}, F1:{f1:.4f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "jieba Precison:0.1060, Recall:0.1478, F1:0.1235\n" + } + ], + "source": [ + "print(f\"jieba Precison:{prec:.4f}, Recall:{recall:.4f}, F1:{f1:.4f}\")" + ] + }, + { + "source": [ + "Textrank调参" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": "100%|██████████| 3000/3000 [00:45<00:00, 66.11it/s]\ntextrank[block: doc, window:2, weighted:False] Precison:0.0942, Recall:0.1314, F1:0.1097\n100%|██████████| 3000/3000 [00:46<00:00, 64.20it/s]\ntextrank[block: doc, window:2, weighted:True] Precison:0.0955, Recall:0.1332, F1:0.1113\n100%|██████████| 3000/3000 [00:41<00:00, 71.53it/s]\ntextrank[block: doc, window:3, weighted:False] Precison:0.0948, Recall:0.1322, F1:0.1104\n100%|██████████| 3000/3000 [00:41<00:00, 
65.70it/s]\ntextrank[block: doc, window:3, weighted:True] Precison:0.0945, Recall:0.1318, F1:0.1101\n100%|██████████| 3000/3000 [00:41<00:00, 72.11it/s]\ntextrank[block: doc, window:4, weighted:False] Precison:0.0944, Recall:0.1316, F1:0.1100\n100%|██████████| 3000/3000 [00:41<00:00, 71.65it/s]\ntextrank[block: doc, window:4, weighted:True] Precison:0.0939, Recall:0.1309, F1:0.1093\n100%|██████████| 3000/3000 [00:45<00:00, 66.37it/s]\ntextrank[block: sent, window:2, weighted:False] Precison:0.0931, Recall:0.1299, F1:0.1085\n100%|██████████| 3000/3000 [00:45<00:00, 65.93it/s]\ntextrank[block: sent, window:2, weighted:True] Precison:0.0945, Recall:0.1318, F1:0.1101\n100%|██████████| 3000/3000 [00:41<00:00, 53.28it/s]\ntextrank[block: sent, window:3, weighted:False] Precison:0.0936, Recall:0.1305, F1:0.1090\n100%|██████████| 3000/3000 [00:40<00:00, 73.21it/s]\ntextrank[block: sent, window:3, weighted:True] Precison:0.0929, Recall:0.1295, F1:0.1082\n100%|██████████| 3000/3000 [00:40<00:00, 73.50it/s]\ntextrank[block: sent, window:4, weighted:False] Precison:0.0931, Recall:0.1298, F1:0.1084\n100%|██████████| 3000/3000 [00:41<00:00, 72.45it/s]\ntextrank[block: sent, window:4, weighted:True] Precison:0.0925, Recall:0.1290, F1:0.1077\n" + } + ], + "source": [ + "from itertools import product\n", + "\n", + "topK = 5\n", + "block_types = [\"doc\", \"sent\"]\n", + "window_sizes = [2, 3, 4]\n", + "if_weighted = [False, True]\n", + "for block_type, window, weighted in product(block_types, window_sizes, if_weighted):\n", + " ref_keywords, pred_keywords = 0, 0\n", + " acc_count = 0\n", + " for text, kwds in tqdm(data_dev):\n", + " ref_keywords += len(kwds)\n", + " pred_keywords += topK\n", + " preds = ht.extract_keywords(text, topK, method=\"textrank\", block_type=block_type, window=window, weighted=weighted)\n", + " acc_count += len(set(kwds) & set(preds))\n", + " prec = acc_count / pred_keywords\n", + " recall = acc_count / ref_keywords\n", + " f1 = 2*prec*recall/(prec+recall)\n", + " print(f\"textrank[block: {block_type}, window:{window}, weighted:{weighted}] Precison:{prec:.4f}, Recall:{recall:.4f}, F1:{f1:.4f}\")" + ] + }, + { + "source": [ + "textrank的最佳参数是 block: doc, window:2, weighted:True\n", + "\n", + "precision和recall与jieba_tfidf还是有差距,可能是因为后者拥有从大量语料库中统计得到的idf数据能起到一定帮助" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "source": [ + "## 测试集benchmark\n", + "\n", + "选取各个算法的最佳参数在测试集上获得最终表现" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "3000" + }, + "metadata": {}, + "execution_count": 22 + } + ], + "source": [ + "data_test = []\n", + "with open('CSL关键词预测/test.json', encoding='utf-8') as f:\n", + " for line in f:\n", + " tmp = json.loads(line)\n", + " data_test.append((tmp['abst'], tmp['keyword']))\n", + "len(data_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": "100%|██████████| 3000/3000 [00:30<00:00, 99.11it/s]\njieba Precison:0.1035, Recall:0.1453, F1:0.1209\n" + } + ], + "source": [ + "topK = 5\n", + "ref_keywords, pred_keywords = 0, 0\n", + "acc_count = 0\n", + "for text, kwds in tqdm(data_test):\n", + " ref_keywords += len(kwds)\n", + " pred_keywords += topK\n", + " preds = ht.extract_keywords(text, topK, method=\"jieba_tfidf\")\n", + " acc_count += len(set(kwds) & set(preds))\n", + "prec = 
acc_count / pred_keywords\n", + "recall = acc_count / ref_keywords\n", + "f1 = 2*prec*recall/(prec+recall)\n", + "print(f\"jieba Precison:{prec:.4f}, Recall:{recall:.4f}, F1:{f1:.4f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": "100%|██████████| 3000/3000 [00:45<00:00, 65.51it/s]\ntextrank Precison:0.0955, Recall:0.1342, F1:0.1116\n" + } + ], + "source": [ + "topK = 5\n", + "ref_keywords, pred_keywords = 0, 0\n", + "acc_count = 0\n", + "for text, kwds in tqdm(data_test):\n", + " ref_keywords += len(kwds)\n", + " pred_keywords += topK\n", + " preds = ht.extract_keywords(text, topK, method=\"textrank\", block_size=\"doc\", window=2, weighted=True)\n", + " acc_count += len(set(kwds) & set(preds))\n", + "prec = acc_count / pred_keywords\n", + "recall = acc_count / ref_keywords\n", + "f1 = 2*prec*recall/(prec+recall)\n", + "print(f\"textrank Precison:{prec:.4f}, Recall:{recall:.4f}, F1:{f1:.4f}\")" + ] + }, + { + "source": [ + "另,附上HarvestText与另一个流行的textrank的实现,[textrank4zh](https://github.com/letiantian/TextRank4ZH)的比较" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "随机噪声雷达通常利用时域相关完成脉冲压缩从而进行目标检测.该文根据压缩感知理论提出一种适用于噪声雷达目标检测的新算法,它用低维投影测量和信号重建取代了传统的相关操作和压缩处理,将大量运算转移到后期处理.该算法以噪声雷达所检测的目标空间分布满足稀疏性为前提;利用发射信号形成卷积矩阵,然后通过随机抽取卷积矩阵的行构建测量矩阵;并采用迭代收缩阈值算法实现目标信号重建.该文对算法作了详细的理论推导,形成完整的实现框架.仿真实验验证了算法的有效性,并分析了对处理结果影响较大的因素.该算法能够有效地重建目标,具有良好的运算效率.与时域相关法相比,大幅度减小了目标检测误差,有效抑制了输出旁瓣,并保持了信号的相位特性.\n真实关键词:['目标', '相关', '矩阵']\ntextrank4zh 关键词(前5):['算法', '信号', '目标', '压缩', '运算']\n" + } + ], + "source": [ + "from textrank4zh import TextRank4Keyword\n", + "\n", + "def textrank4zh(text, topK, window=2):\n", + " # same as used in ht\n", + " allowPOS = {'n', 'ns', 'nr', 'nt', 'nz', 'vn', 'v', 'an', 'a', 'i'}\n", + " tr4w = TextRank4Keyword(allow_speech_tags=allowPOS)\n", + " tr4w.analyze(text=text, lower=True, window=window)\n", + " return [item.word for item in tr4w.get_keywords(topK)]\n", + "\n", + "text, kwds = data_dev[10]\n", + "print(text)\n", + "print(\"真实关键词:\", kwds)\n", + "print(\"textrank4zh 关键词(前5):\", textrank4zh(text, 5))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": "100%|██████████| 3000/3000 [02:12<00:00, 24.17it/s]\ntextrank4zh Precison:0.0836, Recall:0.1174, F1:0.0977\n" + } + ], + "source": [ + "topK = 5\n", + "ref_keywords, pred_keywords = 0, 0\n", + "acc_count = 0\n", + "for text, kwds in tqdm(data_test):\n", + " ref_keywords += len(kwds)\n", + " pred_keywords += topK\n", + " preds = textrank4zh(text, topK)\n", + " acc_count += len(set(kwds) & set(preds))\n", + "prec = acc_count / pred_keywords\n", + "recall = acc_count / ref_keywords\n", + "f1 = 2*prec*recall/(prec+recall)\n", + "print(f\"textrank4zh Precison:{prec:.4f}, Recall:{recall:.4f}, F1:{f1:.4f}\")" + ] + }, + { + "source": [ + "HarvestText的textrank的实现在精度和速度上都有一定的优势。" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "source": [ + "总结各个算法在CSL数据及上的结果:\n", + "\n", + "| 算法 | P@5 | R@5 | F@5 |\n", + "| --- | --- | --- | --- |\n", + "| textrank4zh | 0.0836 | 0.1174 | 0.0977 |\n", + "| ht_textrank | 0.0955 | 0.1342 | 0.1116 |\n", + "| ht_jieba_tfidf | **0.1035** | **0.1453** | **0.1209** |\n", + "\n", + "综上,HarvestText的关键词抽取功能\n", + 
"- 把配置好参数的jieba_tfidf作为默认方法\n", + "- 使用自己的textrank实现而不是用流行的textrank4zh。" + ], + "cell_type": "markdown", + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ] +} \ No newline at end of file diff --git a/harvesttext/algorithms/keyword.py b/harvesttext/algorithms/keyword.py new file mode 100644 index 0000000..0db95c3 --- /dev/null +++ b/harvesttext/algorithms/keyword.py @@ -0,0 +1,36 @@ +import numpy as np +import networkx as nx + +def combine(word_list, window = 2): + """构造在window下的单词组合,用来构造单词之间的边。 + + :params word_list: list of str, 由单词组成的列表。 + :params window: int, 窗口大小。 + """ + if window < 2: window = 2 + for x in range(1, window): + if x >= len(word_list): + break + word_list2 = word_list[x:] + res = zip(word_list, word_list2) + for r in res: + yield r + +def textrank(block_words, topK, with_score=False, window=2, weighted=False): + G = nx.Graph() + for word_list in block_words: + for u, v in combine(word_list, window): + if not weighted: + G.add_edge(u, v) + else: + if G.has_edge(u, v): + G[u][v]['weight'] += 1 + else: + G.add_edge(u, v, weight=1) + + pr = nx.pagerank_scipy(G) + pr_sorted = sorted(pr.items(), key=lambda x: x[1], reverse=True) + if with_score: + return pr_sorted[:topK] + else: + return [w for (w, imp) in pr_sorted[:topK]] \ No newline at end of file diff --git a/harvesttext/algorithms/word_discoverer.py b/harvesttext/algorithms/word_discoverer.py index 7283856..b0376cc 100644 --- a/harvesttext/algorithms/word_discoverer.py +++ b/harvesttext/algorithms/word_discoverer.py @@ -202,11 +202,13 @@ def genWords2(self, doc): v.left = entropyOfList(v.left) v.right = entropyOfList(v.right) return values - def get_df_info(self, ex_mentions): + def get_df_info(self, ex_mentions, exclude_number=True): info = {"text":[],"freq":[],"left_ent":[],"right_ent":[],"agg":[]} for w in self.word_infos: if w.text in ex_mentions: continue + if exclude_number and w.text.isdigit(): + continue info["text"].append(w.text) info["freq"].append(w.freq) info["left_ent"].append(w.left) diff --git a/harvesttext/download_utils.py b/harvesttext/download_utils.py new file mode 100644 index 0000000..e0360ac --- /dev/null +++ b/harvesttext/download_utils.py @@ -0,0 +1,135 @@ +import os +import shutil +import requests +import hashlib +from tqdm import tqdm +from collections import namedtuple +from os import environ, listdir, makedirs +from os.path import dirname, exists, expanduser, isdir, join, splitext + +RemoteFileMetadata = namedtuple('RemoteFileMetadata', + ['filename', 'url', 'checksum']) + +# config according to computer, this should be default setting of shadowsocks +DEFAULT_PROXIES = { + 'http': 'socks5h://127.0.0.1:1080', + 'https': 'socks5h://127.0.0.1:1080' +} + +def get_data_home(data_home=None): + """Return the path of the scikit-learn data dir. + This folder is used by some large dataset loaders to avoid downloading the + data several times. + By default the data dir is set to a folder named 'scikit_learn_data' in the + user home folder. + Alternatively, it can be set by the 'SCIKIT_LEARN_DATA' environment + variable or programmatically by giving an explicit folder path. The '~' + symbol is expanded to the user home folder. + If the folder does not already exist, it is automatically created. + Parameters + ---------- + data_home : str | None + The path to data dir. 
+ """ + if data_home is None: + data_home = environ.get('HARVESTTEXT_DATA', + join('~', '.harvesttext')) + data_home = expanduser(data_home) + if not exists(data_home): + makedirs(data_home) + return data_home + +def clear_data_home(data_home=None): + """Delete all the content of the data home cache. + Parameters + ---------- + data_home : str | None + The path to data dir. + """ + data_home = get_data_home(data_home) + shutil.rmtree(data_home) + +def _sha256(path): + """Calculate the sha256 hash of the file at path.""" + sha256hash = hashlib.sha256() + chunk_size = 8192 + with open(path, "rb") as f: + while True: + buffer = f.read(chunk_size) + if not buffer: + break + sha256hash.update(buffer) + return sha256hash.hexdigest() + +def _download_with_bar(url, file_path, proxies=DEFAULT_PROXIES): + # Streaming, so we can iterate over the response. + response = requests.get(url, stream=True, proxies=proxies) + total_size_in_bytes= int(response.headers.get('content-length', 0)) + block_size = 1024 # 1 KB + progress_bar = tqdm(total=total_size_in_bytes, unit='B', unit_scale=True) + with open(file_path, 'wb') as file: + for data in response.iter_content(block_size): + progress_bar.update(len(data)) + file.write(data) + progress_bar.close() + if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes: + raise Exception("ERROR, something went wrong with the downloading") + return file_path + +def _fetch_remote(remote, dirname=None, use_proxy=False, proxies=DEFAULT_PROXIES): + """Helper function to download a remote dataset into path + Fetch a dataset pointed by remote's url, save into path using remote's + filename and ensure its integrity based on the SHA256 Checksum of the + downloaded file. + Parameters + ---------- + remote : RemoteFileMetadata + Named tuple containing remote dataset meta information: url, filename + and checksum + dirname : string + Directory to save the file to. + Returns + ------- + file_path: string + Full path of the created file. 
+ """ + + file_path = (remote.filename if dirname is None + else join(dirname, remote.filename)) + proxies = None if not use_proxy else proxies + file_path = _download_with_bar(remote.url, file_path, proxies) + checksum = _sha256(file_path) + if remote.checksum != checksum: + raise IOError("{} has an SHA256 checksum ({}) " + "differing from expected ({}), " + "file may be corrupted.".format(file_path, checksum, + remote.checksum)) + return file_path + + +def download(remote, file_path=None, use_proxy=False, proxies=DEFAULT_PROXIES): + data_home = get_data_home() + file_path = _fetch_remote(remote, data_home, use_proxy, proxies) + return file_path + +def check_download_resource(remote, use_proxy=False, proxies=None): + proxies = DEFAULT_PROXIES if use_proxy and proxies is None else proxies + data_home = get_data_home() + file_path = os.path.join(data_home, remote.filename) + if not os.path.exists(file_path): + # currently don't capture error at this level, assume download success + file_path = download(remote, data_home) + return file_path + +if __name__ == "__main__": + ARCHIVE = RemoteFileMetadata( + filename='harvesttext-0.7.2-py3-none-any.whl', + url='https://github.com/blmoistawinde/HarvestText/releases/download/V0.7.2/harvesttext-0.7.2-py3-none-any.whl', + checksum='004c8b0b1858f69025f721bc84cff33127d53c6ab526beed7a7a801a9c21f30b') + print("Download") + file_path = download(ARCHIVE) + print(file_path) + # if proxy is available + # print("Download using proxy") + # file_path = download(ARCHIVE, use_proxy=True) + # print(file_path) \ No newline at end of file diff --git a/harvesttext/harvesttext.py b/harvesttext/harvesttext.py index 5698414..2579969 100644 --- a/harvesttext/harvesttext.py +++ b/harvesttext/harvesttext.py @@ -53,6 +53,7 @@ def __init__(self, standard_name=False, language='zh_CN'): self.pinyin_adjlist = json.load(f) self.language = language if language == "en": + import nltk try: nltk.data.find('taggers/averaged_perceptron_tagger') except: @@ -774,7 +775,7 @@ def clean_text(self, text, remove_url=True, email=True, weibo_at=True, stop_term if t2s: cc = OpenCC('t2s') text = cc.convert(text) - assert hasattr(stop_terms, "__init__"), Exception("去除的词语必须是一个可迭代对象") + assert hasattr(stop_terms, "__iter__"), Exception("去除的词语必须是一个可迭代对象") if type(stop_terms) == str: text = text.replace(stop_terms, "") else: diff --git a/harvesttext/parsing.py b/harvesttext/parsing.py index 63c7fce..171be03 100644 --- a/harvesttext/parsing.py +++ b/harvesttext/parsing.py @@ -139,7 +139,7 @@ def cut_paragraphs(self, text, num_paras=None, block_sents=3, std_weight=0.5, if num_paras is not None: assert num_paras > 0, "Should give a positive number of num_paras" assert stopwords == 'baidu' or (hasattr(stopwords, '__iter__') and type(stopwords) != str) - stopwords = get_baidu_stopwords() if stopwords == 'baidu' else stopwords + stopwords = get_baidu_stopwords() if stopwords == 'baidu' else set(stopwords) if seq_chars < 1: cut_seqs = lambda x: self.cut_sentences(x, **kwargs) else: diff --git a/harvesttext/resources.py b/harvesttext/resources.py index 623329b..84ff3de 100644 --- a/harvesttext/resources.py +++ b/harvesttext/resources.py @@ -10,6 +10,7 @@ # 李军 中文评论的褒贬义分类实验研究 硕士论文 清华大学 2008 import os import json +from collections import defaultdict def get_qh_sent_dict(): """ @@ -123,4 +124,31 @@ def get_english_senti_lexicon(type="LH"): senti_lexicon = json.load(f) return senti_lexicon - +def get_jieba_dict(min_freq=0, max_freq=float('inf'), with_pos=False, use_proxy=False, proxies=None): + """ + 
获得jieba自带的中文词语词频词典 + + :params min_freq: 选取词语需要的最小词频 + :params max_freq: 选取词语允许的最大词频 + :params with_pos: 返回结果是否包括词性信息 + :return if not with_pos, dict of {wd: freq}, else, dict of {(wd, pos): freq} + """ + from .download_utils import RemoteFileMetadata, check_download_resource + remote = RemoteFileMetadata( + filename='jieba_dict.txt', + url='https://github.com/blmoistawinde/HarvestText/releases/download/V0.8/jieba_dict.txt', + checksum='7197c3211ddd98962b036cdf40324d1ea2bfaa12bd028e68faa70111a88e12a8') + file_path = check_download_resource(remote, use_proxy, proxies) + ret = defaultdict(int) + with open(file_path, "r", encoding="utf-8") as f: + for line in f: + if len(line.strip().split()) == 3: + wd, freq, pos = line.strip().split() + freq = int(freq) + if freq > min_freq and freq < max_freq: + if not with_pos: + ret[wd] = freq + else: + ret[(wd, pos)] = freq + return ret + \ No newline at end of file diff --git a/harvesttext/word_discover.py b/harvesttext/word_discover.py index b302ec4..087e57d 100644 --- a/harvesttext/word_discover.py +++ b/harvesttext/word_discover.py @@ -1,4 +1,7 @@ +import jieba +import jieba.analyse import logging +import networkx as nx import numpy as np import pandas as pd from collections import defaultdict @@ -6,6 +9,7 @@ from .resources import get_baidu_stopwords from .algorithms.word_discoverer import WordDiscoverer from .algorithms.entity_discoverer import NFLEntityDiscoverer, NERPEntityDiscover +from .algorithms.keyword import textrank class WordDiscoverMixin: """ @@ -18,7 +22,7 @@ class WordDiscoverMixin: def word_discover(self, doc, threshold_seeds=[], auto_param=True, excluding_types=[], excluding_words='baidu_stopwords', # 可以排除已经登录的某些种类的实体,或者某些指定词 max_word_len=5, min_freq=0.00005, min_entropy=1.4, min_aggregation=50, - ent_threshold="both", mem_saving=None, sort_by='freq'): + ent_threshold="both", mem_saving=None, sort_by='freq', exclude_number=True): '''新词发现,基于 http://www.matrix67.com/blog/archives/5044 实现及微调 :param doc: (string or list) 待进行新词发现的语料,如果是列表的话,就会自动用换行符拼接 @@ -33,6 +37,7 @@ def word_discover(self, doc, threshold_seeds=[], auto_param=True, :param ent_threshold: "both": (默认)在使用左右交叉熵进行筛选时,两侧都必须超过阈值; "avg": 两侧的平均值达到阈值即可 :param mem_saving: bool or None, 采用一些过滤手段来减少内存使用,但可能影响速度。如果不指定,对长文本自动打开,而对短文本不使用 :param sort_by: 以下string之一: {'freq': 词频, 'score': 综合分数, 'agg':凝聚度} 按照特定指标对得到的词语信息排序,默认使用词频 + :param exclude_number: (默认True)过滤发现的纯数字新词 :return: info: 包含新词作为index, 以及对应各项指标的DataFrame ''' if type(doc) != str: @@ -72,7 +77,7 @@ def word_discover(self, doc, threshold_seeds=[], auto_param=True, else: ex_mentions |= set(excluding_words) - info = ws.get_df_info(ex_mentions) + info = ws.get_df_info(ex_mentions, exclude_number) # 利用种子词来确定筛选优质新词的标准,种子词中最低质量的词语将被保留(如果一开始就被找到的话) if len(threshold_seeds) > 0: @@ -234,4 +239,66 @@ def entity_discover(self, text, return_count=False, method="NFL", min_count=5, p return entity_mention_dict, entity_type_dict, mention_count else: return entity_mention_dict, entity_type_dict + + def extract_keywords(self, text, topK, with_score=False, min_word_len=2, stopwords="baidu", allowPOS="default", method="jieba_tfidf", **kwargs): + """用各种算法抽取关键词(目前均为无监督),结合了ht的实体分词来提高准确率 + 目前支持的算法类型(及额外参数): + + - jieba_tfidf: (默认)jieba自带的基于tfidf的关键词抽取算法,idf统计信息来自于其语料库 + - textrank: 基于textrank的关键词抽取算法 + - block_type: 默认"doc"。 支持三种级别,"sent", "para", "doc",每个block之间的临近词语不建立连边 + - window: 默认2, 邻接的几个词语之内建立连边 + - weighted: 默认False, 时候使用加权图计算textrank + - 构建词图时会过滤不符合min_word_len, stopwords, allowPOS要求的词语 + + :params text: 从中挖掘关键词的文档 + :params topK: int, 
从每个文档中抽取的关键词(最大)数量 + :params with_score: bool, 默认False, 是否同时返回算法提供的分数(如果有的话) + :params min_word_len: 默认2, 被纳入关键词的词语不低于此长度 + :param stopwords: 字符串列表/元组/集合,或者'baidu'为默认百度停用词,在算法中引入的停用词,一般能够提升准确度 + :params allowPOS: iterable of str,关键词应当属于的词性,默认为"default" {'n', 'ns', 'nr', 'nt', 'nz', 'vn', 'v', 'an', 'a', 'i'}以及已登录的实体词类型 + :params method: 选择用于抽取的算法,目前支持"jieba_tfidf", "tfidf", "textrank" + :params kwargs: 其他算法专属参数 + + + """ + assert method in {"jieba_tfidf", "textrank"}, print("目前不支持的算法") + if allowPOS == 'default': + # ref: 结巴分词标注兼容_ICTCLAS2008汉语词性标注集 https://www.cnblogs.com/hpuCode/p/4416186.html + allowPOS = {'n', 'ns', 'nr', 'nt', 'nz', 'vn', 'v', 'an', 'a', 'i'} + else: + assert hasattr(allowPOS, "__iter__") + # for HT, we consider registered entity types specifically + allowPOS |= set(self.type_entity_mention_dict) + + assert stopwords == 'baidu' or (hasattr(stopwords, '__iter__') and type(stopwords) != str) + stopwords = get_baidu_stopwords() if stopwords == 'baidu' else set(stopwords) + + if method == "jieba_tfidf": + kwds = jieba.analyse.extract_tags(text, topK=int(2*topK), allowPOS=allowPOS, withWeight=with_score) + if with_score: + kwds = [(kwd, score) for (kwd, score) in kwds if kwd not in stopwords][:topK] + else: + kwds = kwds[:topK] + elif method == "textrank": + block_type = kwargs.get("block_type", "doc") + assert block_type in {"sent", "para", "doc"} + window = kwargs.get("window", 2) + weighted = kwargs.get("weighted", True) + if block_type == "doc": + blocks = [text] + elif block_type == "para": + blocks = [para.strip() for para in text.split("\n") if para.strip() != ""] + elif block_type == "sent": + blocks = self.cut_sentences(text) + block_pos = (self.posseg(block.strip(), stopwords=stopwords) for block in blocks) + block_words = [[wd for wd, pos in x + if pos in allowPOS and len(wd) >= min_word_len] + for x in block_pos] + kwds = textrank(block_words, topK, with_score, window, weighted) + + return kwds + + + \ No newline at end of file
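
A minimal usage sketch of the `extract_keywords` API added above; `ht` and the sample sentence are placeholders, and `block_type`, `window`, `weighted` are the textrank kwargs defined in this method:

```python3
from harvesttext import HarvestText

ht = HarvestText()
text = "随机噪声雷达通常利用时域相关完成脉冲压缩从而进行目标检测。"

# default method: jieba tf-idf with HarvestText's stopword and POS settings
print(ht.extract_keywords(text, 3, method="jieba_tfidf"))

# textrank with explicit graph options; with_score=True returns (word, pagerank) pairs
print(ht.extract_keywords(text, 3, method="textrank",
                          block_type="sent", window=2, weighted=True,
                          with_score=True))
```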