-
Notifications
You must be signed in to change notification settings - Fork 84
/
TFIDF_space.py
48 lines (39 loc) · 1.83 KB
/
TFIDF_space.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@version: python3.6
@author: XiangguoSun
@contact: [email protected]
@file: TFIDF_space.py
@time: 2018/1/23 16:12
@software: PyCharm
"""
from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer
from Tools import readfile, readbunchobj, writebunchobj
def vector_space(stopword_path, bunch_path, space_path, train_tfidf_path=None):
    """Build a TF-IDF term-document matrix for a corpus and save it.

    The corpus Bunch is loaded from ``bunch_path``. Its ``target_name``,
    ``label`` and ``filenames`` fields are copied into a new Bunch together
    with the computed matrix (``tdm``) and the term mapping (``vocabulary``),
    and that Bunch is written to ``space_path``.

    Args:
        stopword_path: File read with ``readfile``; its lines are handed to
            the vectorizer as the stop-word list.
        bunch_path: Stored corpus Bunch, read with ``readbunchobj``. Its
            ``contents`` field is the input to the vectorizer.
        space_path: Destination passed to ``writebunchobj`` for the result.
        train_tfidf_path: Optional path of a previously saved TF-IDF Bunch.
            When given, its ``vocabulary`` is reused as a fixed vocabulary;
            when ``None``, the vocabulary is learned from this corpus.
    """
    stop_words = readfile(stopword_path).splitlines()
    corpus = readbunchobj(bunch_path)

    # Start from the corpus metadata; tdm/vocabulary are filled in below.
    space = Bunch(
        target_name=corpus.target_name,
        label=corpus.label,
        filenames=corpus.filenames,
        tdm=[],
        vocabulary={},
    )

    # Options common to both modes of operation.
    shared_options = {"stop_words": stop_words, "sublinear_tf": True, "max_df": 0.5}

    if train_tfidf_path is None:
        # No reference space: learn the vocabulary from this corpus.
        vectorizer = TfidfVectorizer(**shared_options)
        space.tdm = vectorizer.fit_transform(corpus.contents)
        space.vocabulary = vectorizer.vocabulary_
    else:
        # Reuse the vocabulary stored in the reference space.
        # Note that fit_transform is still called, so the vectorizer is
        # fitted on this corpus as well.
        fixed_vocabulary = readbunchobj(train_tfidf_path).vocabulary
        space.vocabulary = fixed_vocabulary
        vectorizer = TfidfVectorizer(vocabulary=fixed_vocabulary, **shared_options)
        space.tdm = vectorizer.fit_transform(corpus.contents)

    writebunchobj(space_path, space)
    print("if-idf词向量空间实例创建成功!!!")
if __name__ == '__main__':
    # Both runs share the same stop-word list.
    stopword_path = "train_word_bag/hlt_stop_words.txt"
    # Written by the first run, then read back by the second run.
    train_space_path = "train_word_bag/tfdifspace.dat"

    # Run 1: training corpus, vocabulary learned from the corpus itself.
    vector_space(stopword_path, "train_word_bag/train_set.dat", train_space_path)

    # Run 2: test corpus, restricted to the vocabulary saved by run 1.
    vector_space(
        stopword_path,
        "test_word_bag/test_set.dat",
        "test_word_bag/testspace.dat",
        train_space_path,
    )