-
Notifications
You must be signed in to change notification settings - Fork 104
/
Copy pathword_analyze.py
55 lines (46 loc) · 1.8 KB
/
word_analyze.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os
from collections import Counter

import jieba.posseg as psg
def get_frequency(lst):
    """Count how often each item occurs and rank the items by frequency.

    Args:
        lst: List of hashable items (here: words).

    Returns:
        A list of ``(item, count)`` tuples sorted by count, highest first.
        Items with equal counts keep the order in which they first
        appeared in ``lst``. An empty input yields an empty list.
    """
    # Counter.most_common() performs a stable sort on the counts in
    # descending order, so ties stay in first-seen order.
    return Counter(lst).most_common()
def print_word_frequency(file_path):
    """Print the ten most frequent nouns of a UTF-8 text file with a dot bar chart.

    The text is segmented with ``psg.cut``; only tokens whose flag is
    ``n``, ``nr`` or ``ns`` (nouns, person names, place names) and whose
    word is at least two characters long are counted.

    Args:
        file_path: Path of the text file to analyse.

    Returns:
        None. The table is written to standard output. If ``file_path`` is
        not an existing regular file, a message is printed and the function
        returns without doing anything else.
    """
    # 1. Read the whole file; isfile() is already False for missing paths.
    if not os.path.isfile(file_path):
        print('文件不存在:{}'.format(file_path))
        return
    with open(file=file_path, mode='r', encoding='utf-8') as f:
        content = f.read()

    # 2. Keep only the interesting nouns.
    wanted_flags = ('n', 'nr', 'ns')
    lst_words = [
        pair.word
        for pair in psg.cut(content)
        if pair.flag in wanted_flags and len(pair.word) > 1
    ]

    # 3. Rank by frequency (highest first) and keep at most the top ten.
    top_words = get_frequency(lst_words)[:10]

    # 4. Pick the divisor that scales the bars so they are neither too long
    #    nor too short: start at 50, but if the last displayed word is rarer
    #    than that, use half of its count instead.
    divide = 50
    if top_words:  # an empty ranking used to raise IndexError here
        lowest_count = top_words[-1][1]
        if lowest_count < divide:
            divide = lowest_count // 2
    # A count of 1 halves to 0; clamp so the division below is always valid.
    divide = max(divide, 1)

    print('\n序号\t名词\t词频\t柱图\n')
    for rank, (word, count) in enumerate(top_words, start=1):
        print('{}\t{}\t{}\t{}\n'.format(
            rank, word, count, '.' * (count // divide)))
# Script entry point: analyse the README located in the current working
# directory when this file is executed directly (not when imported).
if __name__ == "__main__":
    print_word_frequency(file_path='./README.md')