Bayes.py
'''
Multiclass Naive Bayes implementation.
'''
import os
import re
import jieba
import numpy as np

base_dir = os.path.dirname(__file__)
stop = [line.strip() for line in open(os.path.join(base_dir, 'stop.txt'), 'r', encoding='utf-8').readlines()]  # stop words

def build_word_array(line):  # tokenise a single post: jieba cut, drop ASCII-only tokens, remove stop words
    line_cut = []
    temp = line.strip()
    try:
        sentence = temp.lstrip()  # one Weibo post
        word_list = []
        sentence = str(sentence).replace('\u200b', '')
        p = re.compile(rb'\w', re.L)
        for word in jieba.cut(sentence.strip()):
            result = p.sub(b"", bytes(word, encoding="utf-8")).decode("utf-8")
            if not result or result == ' ':  # empty after stripping word characters
                continue
            word_list.append(word)
        word_list = list(set(word_list) - set(stop) - set('\u200b')
                         - set(' ') - set('\u3000') - set('️'))
        line_cut.append(word_list)
    except Exception:
        return None
    return line_cut
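
# Note: build_word_array is not called elsewhere in this file; it appears intended for
# preprocessing a single new post before vectorisation with setOfWordsToVecTor.
# Hypothetical usage (actual tokens depend on jieba's segmentation and on stop.txt):
#   tokens = build_word_array("今天心情不错")   # -> [[... deduplicated, stop-word-filtered tokens ...]]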

def build_key_word(path):  # build feature words from term frequency
    d = {}
    p = re.compile(rb'\w', re.L)
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            for word in jieba.cut(line.strip()):
                result = p.sub(b"", bytes(word, encoding="utf-8")).decode("utf-8")
                if not result or result == ' ':  # empty after stripping word characters
                    continue
                if len(word) > 1:  # skip single-character tokens; they add little information
                    d[word] = d.get(word, 0) + 1
    kw_list = sorted(d, key=lambda x: d[x], reverse=True)
    size = int(len(kw_list) * 0.2)  # keep the top 20% most frequent words
    mood = set(kw_list[:size])
    return list(mood - set(stop))

def loadDataSet(path):  # return the tokens and label of each Weibo post
    line_cut = []
    label = []
    p = re.compile(rb'\w', re.L)
    with open(path, encoding="utf-8") as fp:
        for line in fp:
            temp = line.strip()
            try:
                sentence = temp[2:].lstrip()  # the post text
                label.append(int(temp[:2]))  # the first two characters hold the label
                word_list = []
                sentence = str(sentence).replace('\u200b', '')
                for word in jieba.cut(sentence.strip()):
                    result = p.sub(b"", bytes(word, encoding="utf-8")).decode("utf-8")
                    if not result or result == ' ':  # empty after stripping word characters
                        continue
                    word_list.append(word)
                word_list = list(set(word_list) - set(stop) - set('\u200b')
                                 - set(' ') - set('\u3000') - set('️'))
                line_cut.append(word_list)
            except Exception:
                continue
    return line_cut, label  # tokens and labels for every post

def setOfWordsToVecTor(vocabularyList, moodWords):  # turn one post into a count vector
    vocabMarked = [0] * len(vocabularyList)
    for smsWord in moodWords:
        if smsWord in vocabularyList:
            vocabMarked[vocabularyList.index(smsWord)] += 1
    return np.array(vocabMarked)

def setOfWordsListToVecTor(vocabularyList, train_mood_array):  # vectorise every post
    vocabMarkedList = []
    for i in range(len(train_mood_array)):
        vocabMarked = setOfWordsToVecTor(vocabularyList, train_mood_array[i])
        vocabMarkedList.append(vocabMarked)
    return vocabMarkedList
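
# A minimal, hypothetical end-to-end sketch of how these helpers might be combined.
# The corpus file name 'train.txt' and the use of scikit-learn's MultinomialNB are
# assumptions, not part of the original project; any multinomial Naive Bayes that
# consumes count vectors would work the same way.
if __name__ == "__main__":
    train_path = os.path.join(base_dir, "train.txt")     # assumed format: "<2-char label><text>" per line
    vocab = build_key_word(train_path)                   # frequency-based feature words
    train_mood_array, labels = loadDataSet(train_path)   # tokenised posts and their labels
    train_matrix = setOfWordsListToVecTor(vocab, train_mood_array)
    # from sklearn.naive_bayes import MultinomialNB
    # clf = MultinomialNB().fit(np.array(train_matrix), np.array(labels))
    # new_post = build_word_array("今天心情不错")
    # print(clf.predict(setOfWordsToVecTor(vocab, new_post[0]).reshape(1, -1)))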