import string

import torch


def load_stopwords(path="./Resources/stopwords.txt"):
    """Load the stopword set."""
    with open(path, "r", encoding="utf-8") as f:
        stopwords = [word.strip("\n") for word in f]
    # Also treat whitespace, ASCII letters, digits and punctuation as stopwords
    stopwords += list(string.printable)
    return set(stopwords)


def load_word2id(length=2000, vocab_path="./datasets/vocab.csv"):
    """Map the first `length` words of the vocabulary file to integer ids,
    reserving 0 for padding and 1 for unknown words."""
    word2id = {"<pad>": 0, "<unk>": 1}
    with open(vocab_path, "r", encoding="utf-8") as f:
        words = [line.split(",")[0] for line in f]
    for word in words[:length]:
        word2id[word] = len(word2id)
    return word2id


def load_embeddings(word2id, emb_dim=300,
                    emb_path="./Resources/pretrained_embeddings"):
    """Build an embedding matrix for `word2id` from pretrained vectors."""
    vocab_size = len(word2id)
    embedding = torch.Tensor(vocab_size, emb_dim)
    # Load the pretrained embeddings from disk
    word2embstr = {}
    with open(emb_path, "r", encoding="utf-8") as f:
        for line in f:
            word, embstr = line.split(" ", 1)
            word2embstr[word] = embstr.strip("\n")
    # Look up the vector for every word in our vocabulary;
    # words without a pretrained vector get a random one
    for word, word_id in word2id.items():
        if word in word2embstr:
            embs = list(map(float, word2embstr[word].split()))
            embedding[word_id] = torch.Tensor(embs)
        else:
            embedding[word_id] = torch.randn(emb_dim)
    return embedding
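
# A minimal sketch (not part of the original file) of how the returned matrix
# could initialize a PyTorch embedding layer; `freeze=False` keeps the vectors
# trainable. The variable names here are illustrative assumptions.
#
#     import torch.nn as nn
#     word2id = load_word2id()
#     weights = load_embeddings(word2id)
#     emb_layer = nn.Embedding.from_pretrained(weights, freeze=False)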


def collate_fn_ml(word2id, batch):
    """Collate a batch for the ML classifiers: turn each text into a
    bag-of-words count vector over the vocabulary."""
    labels, sentences = zip(*batch)
    labels = torch.LongTensor([int(label) for label in labels])
    bsize = len(sentences)
    length = len(word2id)
    sent_tensor = torch.zeros(bsize, length).long()
    for sent_id, sent in enumerate(sentences):
        for gram in sent:
            if gram in word2id:
                gram_id = word2id[gram]
                sent_tensor[sent_id][gram_id] += 1
    return labels, sent_tensor


def collate_fn_dl(word2id, max_len, batch):
    """Collate a batch for the DL classifiers: map words to ids and pad
    every sentence to a common length."""
    # Sort by sentence length (longest first) so the batch can later be
    # packed with pack_padded_sequence
    batch.sort(key=lambda pair: len(pair[1]), reverse=True)
    labels, sentences = zip(*batch)
    # Truncate overly long sentences to max_len (was a hard-coded 64)
    sentences = [sent[:max_len] for sent in sentences]
    labels = torch.LongTensor([int(label) for label in labels])
    pad_id = word2id["<pad>"]
    unk_id = word2id["<unk>"]
    bsize = len(sentences)
    sent_tensor = torch.full((bsize, max_len), pad_id, dtype=torch.long)
    for sent_id, sent in enumerate(sentences):
        for word_id, word in enumerate(sent):
            sent_tensor[sent_id][word_id] = word2id.get(word, unk_id)
    lengths = [len(sent) for sent in sentences]
    return labels, sent_tensor, lengths
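
# A minimal sketch (not part of the original file) of how the sorted, padded
# batch could feed an RNN via packing; `emb_layer` and `rnn` are illustrative
# assumptions.
#
#     from torch.nn.utils.rnn import pack_padded_sequence
#     embedded = emb_layer(sent_tensor)            # (batch, max_len, emb_dim)
#     packed = pack_padded_sequence(embedded, lengths, batch_first=True)
#     output, hidden = rnn(packed)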


def preprocess_for_ml(sentences):
    # Separate characters with spaces (character-level tokenization)
    sentences = [" ".join(list(sent)) for sent in sentences]
    # Adding bigram features turned out to work worse than unigrams alone:
    # sentences = [" ".join(get_features(sent)) for sent in sentences]
    return sentences


def get_features(sent):
    """Extract unigram and bigram features from a sentence."""
    unigrams = list(sent)
    bigrams = [sent[i:i + 2] for i in range(len(sent) - 1)]
    return unigrams + bigrams
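

if __name__ == "__main__":
    # A minimal self-contained sketch (not part of the original file) showing
    # the collate functions on a toy batch; vocabulary and data are assumptions.
    word2id = {"<pad>": 0, "<unk>": 1, "好": 2, "坏": 3}
    batch = [("1", "好好"), ("0", "坏")]

    labels, bow = collate_fn_ml(word2id, list(batch))
    print(labels, bow)           # label tensor and bag-of-words counts

    labels, ids, lengths = collate_fn_dl(word2id, 4, list(batch))
    print(labels, ids, lengths)  # label tensor, padded id matrix, true lengths

    # With a torch Dataset of (label, sentence) pairs the same functions
    # plug into a DataLoader via functools.partial, e.g.:
    #   loader = DataLoader(dataset, batch_size=64,
    #                       collate_fn=partial(collate_fn_dl, word2id, 64))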