-
Notifications
You must be signed in to change notification settings - Fork 0
/
vocab_utils.py
80 lines (69 loc) · 2.71 KB
/
vocab_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# -*- coding: utf-8 -*-
from __future__ import print_function
import numpy as np
import re
import os
# import math
class Vocab(object):
    """Token vocabulary paired with an embedding matrix.

    An instance is populated by one of the ``pattern*`` builders, after
    which it exposes:

    * ``word2id`` / ``id2word`` -- token <-> integer index mappings,
    * ``word_vecs`` -- float32 array of shape ``(vocab_size, word_dim)``
      whose row ``i`` is the vector for ``id2word[i]``,
    * ``vocab_size`` and ``word_dim``.
    """

    # Placeholder token returned by to_index_sequence for unknown words.
    UNK_TOKEN = '<UNK/>'

    def __init__(self, pattern):
        """Store ``pattern`` on the instance.

        The value is only kept as ``self.pattern``; none of the methods of
        this class read it.
        """
        self.pattern = pattern

    def patternWord(self, train_path="", model_dir="", min_dim=128):
        """Build the vocabulary from a text file of word vectors.

        Every line of ``train_path`` is split on single spaces into a token
        followed by its vector components. Lines with fewer than ``min_dim``
        components (e.g. a "count dim" header or a blank line) are skipped.
        A token that occurs more than once keeps its first vector.
        ``<UNK/>`` is appended with a random normal vector unless the file
        already defines it. The token -> index table is finally written to
        ``model_dir + "/word2id.txt"`` as ``token<TAB>index`` lines.

        Args:
            train_path: path of the word-vector text file to read.
            model_dir: directory that receives ``word2id.txt``.
            min_dim: minimum number of vector components a line must have
                to be accepted (defaults to the historical value 128).

        Raises:
            ValueError: if no line provides a usable vector, or a component
                cannot be parsed as a float.
        """
        self.word2id = {}
        self.id2word = {}
        vectors = []  # vectors[i] is the vector of id2word[i]

        # A line without any component can never be a valid entry.
        required_dim = max(min_dim, 1)
        with open(train_path, 'rt') as vec_file:
            for line in vec_file:
                # Split on a single space only: some embedding files contain
                # tokens with other whitespace characters inside them.
                parts = line.strip().split(' ')
                if len(parts) - 1 < required_dim:
                    continue
                word = parts[0]
                vector = np.array(parts[1:], dtype='float32')
                if word in self.word2id:
                    # Re-adding would hand out an index that is already in
                    # use (len(word2id) does not grow), so keep the first.
                    continue
                self._add_word(word)
                vectors.append(vector)

        if not vectors:
            raise ValueError(
                "no word vectors with at least {} dimensions found in {!r}"
                .format(min_dim, train_path))

        # Derive the width from accepted lines only, so skipped lines
        # (headers, trailing blanks) cannot shrink the matrix.
        self.word_dim = max(len(vector) for vector in vectors)

        if self.UNK_TOKEN not in self.word2id:
            self._add_word(self.UNK_TOKEN)
            vectors.append(np.random.normal(0, 1, size=(self.word_dim,)))

        self.vocab_size = len(self.word2id)
        self.word_vecs = np.zeros((self.vocab_size, self.word_dim),
                                  dtype=np.float32)
        for index, vector in enumerate(vectors):
            # Shorter vectors are left zero-padded on the right.
            self.word_vecs[index, :len(vector)] = vector

        word2id_path = model_dir + "/word2id.txt"
        print("word2id path:", word2id_path)
        with open(word2id_path, "w") as out_op:
            for index in range(self.vocab_size):
                out_op.write(self.id2word[index] + "\t" + str(index) + "\n")

    def patternLabel(self, voc, label_path=""):
        """Build a vocabulary over ``voc`` with random 2-dimensional vectors.

        Each distinct item of ``voc`` gets the next free index in iteration
        order; repeated items are ignored. The vectors are drawn uniformly
        from ``[-0.05, 0.05)`` and the table is written to ``label_path`` as
        ``index<TAB>item<TAB>v0 v1 `` lines (each value is followed by a
        space). No ``<UNK/>`` entry is added.

        Args:
            voc: iterable of string labels (original note: voc=all_chars).
            label_path: path of the output file to write.
        """
        self.word2id = {}
        self.id2word = {}
        self.word_dim = 2
        for word in voc:
            if word not in self.word2id:
                self._add_word(word)
        # Count the distinct entries so indices and matrix rows line up
        # even when ``voc`` contains repeats.
        self.vocab_size = len(self.word2id)

        shape = (self.vocab_size, self.word_dim)
        scale = 0.05
        self.word_vecs = np.array(
            np.random.uniform(low=-scale, high=scale, size=shape),
            dtype=np.float32)

        with open(label_path, "w") as out_op:
            for index in range(self.vocab_size):
                word_vec = "".join(
                    "{} ".format(value) for value in self.word_vecs[index])
                out_op.write(str(index) + "\t" + self.id2word[index]
                             + "\t" + word_vec + "\n")

    def to_index_sequence(self, sentence):
        """Convert a whitespace-separated sentence into a list of indices.

        Tokens missing from ``word2id`` map to the ``<UNK/>`` index; that
        lookup only happens when an unknown token is met, so a vocabulary
        without ``<UNK/>`` raises ``KeyError`` only in that case. An empty
        sentence yields a single empty token and therefore one ``<UNK/>``
        index.

        Args:
            sentence: the text to convert.

        Returns:
            List of integer indices, one per token.
        """
        seq = []
        for word in re.split('\\s+', sentence.strip()):
            idx = self.word2id.get(word)
            if idx is None:
                idx = self.word2id[self.UNK_TOKEN]
            seq.append(idx)
        return seq

    def _add_word(self, word):
        """Register ``word`` under the next free index and return it."""
        index = len(self.word2id)
        self.word2id[word] = index
        self.id2word[index] = word
        return index