import os
import pickle
import time
from datetime import timedelta
import numpy as np
import math
import json
import torch
from torch.nn import functional as F
from torch.utils.data import Dataset
from config import Config
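
# Utility helpers: embedding loaders, timing functions, and the MPCNDataset
# wrapper that turns posts and comments from a JSON split (e.g. twitter15)
# into padded word-index tensors.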


def load_embeddings(word2vec_file):
    """Load a word2vec-style text file: one word followed by its vector per line."""
    word_emb = list()
    word_dict = dict()
    word_emb.append([0])  # placeholder row for <UNK>, resized to the real dimension below
    word_dict['<UNK>'] = 0
    with open(word2vec_file, encoding='utf-8') as f:
        for line in f:
            tokens = line.rstrip('\n').split(' ')
            if not tokens[0]:
                continue  # skip blank lines
            word_emb.append([float(i) for i in tokens[1:]])
            word_dict[tokens[0]] = len(word_dict)  # row index of the vector just appended (<UNK> already occupies 0)
    word_emb[0] = [0] * len(word_emb[1])  # zero vector for <UNK>, matching the embedding dimension
    return word_emb, word_dict


# If the first line of the embeddings file is a header (e.g. vocabulary size and
# dimension), it must be skipped, which is what this variant does.
def load_embedding(embeddings_file):
    word_emb = list()
    word_dict = dict()
    word_emb.append([0])  # placeholder row for <UNK>, resized to the real dimension below
    word_dict['<UNK>'] = 0
    with open(embeddings_file, 'r', encoding='utf-8') as f:
        lines = [l.strip() for l in f.readlines()]
    for line in lines[1:]:  # skip the header line
        tokens = line.split(' ')
        if not tokens[0]:
            continue  # skip blank lines
        word_emb.append([float(i) for i in tokens[1:]])
        word_dict[tokens[0]] = len(word_dict)  # row index of the vector just appended
    word_emb[0] = [0] * len(word_emb[1])  # zero vector for <UNK>
    return word_emb, word_dict
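
# Example (hypothetical line "hello 0.1 0.2"): word_dict['hello'] is the row of
# word_emb holding [0.1, 0.2], and word_dict['<UNK>'] is row 0, an all-zero
# vector of the same dimension.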


def date(f='%Y-%m-%d %H:%M:%S'):
    return time.strftime(f, time.localtime())


def get_time_dif(start_time):
    """Return the elapsed time since start_time as a timedelta."""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(time_dif))
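
# Example: start = time.time(); ...; get_time_dif(start) yields a timedelta that
# prints as e.g. 0:01:05 after 65 seconds.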


class MPCNDataset(Dataset):
    def __init__(self, data_path, word_dict, emotion_dict, config, retain_rui=True):
        # retain_rui is currently unused: every sample is kept (see retain_idx below).
        self.content_count = config.content_count
        self.content_length = config.content_length
        self.comment_count = config.comment_count
        self.lowest_r_count = config.lowest_review_count  # minimum number of reviews written by a single user/item
        self.review_length = config.review_length
        self.PAD_idx = word_dict[config.PAD_WORD]
        self.emo_idx = emotion_dict[config.PAD_WORD]
        label2idx = {'false': 0, 'true': 1}

        contents = []
        comments = []
        label = []
        # stopwords = [line.strip() for line in open('data/stopwords.txt', encoding='UTF-8').readlines()]
        split_dataset = [json.load(open(data_path, 'r', encoding='utf-8'))]  # open the .json file
        split_dataset = dict(zip(['twitter15'], split_dataset))  # wrap the single split under the 'twitter15' key
        for p in split_dataset['twitter15']:
            contents.append([p['content']])
            label.append(label2idx[p['label']])
            comments.append(p['comments'])

        # Map every comment to a list of word indices; unknown words fall back to emo_idx.
        reviews = []
        for i in range(len(comments)):
            b = []
            for j in range(len(comments[i])):
                a = []
                for w in str(comments[i][j]).split():
                    # if w not in stopwords:
                    a.append(word_dict.get(w, self.emo_idx))
                b.append(a)
            reviews.append(b)

        # Map every post's content to word indices; unknown words fall back to PAD_idx.
        content = []
        for i in range(len(contents)):
            b = []
            for j in range(len(contents[i])):
                a = []
                for w in str(contents[i][j]).split():
                    # if w not in stopwords:
                    a.append(word_dict.get(w, self.PAD_idx))
                b.append(a)
            content.append(b)

        post, comments = self._get_content_reviews(content, reviews)  # pad and tensorize posts and comments
        retain_idx = list(range(post.shape[0]))  # keep every sample
        self.user_post = post[retain_idx]
        self.user_comments = comments[retain_idx]
        self.rating = torch.Tensor(label)[retain_idx]
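        # Resulting tensors:
        #   user_post:     LongTensor of shape (N, content_count, content_length)
        #   user_comments: LongTensor of shape (N, comment_count, review_length)
        #   rating:        FloatTensor of shape (N,), 0 = false, 1 = true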

    def __getitem__(self, idx):
        return self.user_post[idx], self.user_comments[idx], self.rating[idx]

    def __len__(self):
        return self.rating.shape[0]

    def _get_content_reviews(self, content, reviews):
        group_reviews = []
        group_content = []
        for i in range(len(reviews)):
            pad_reviews = self._pad_reviews(reviews[i])
            pad_content = self._pad_content(content[i])
            group_reviews.append(pad_reviews)
            group_content.append(pad_content)
        return torch.LongTensor(group_content), torch.LongTensor(group_reviews)

    def _pad_reviews(self, reviews):
        count, length = self.comment_count, self.review_length
        reviews = reviews[:count] + [[self.emo_idx] * length] * (count - len(reviews))  # fixed number of comments
        reviews = [r[:length] + [0] * (length - len(r)) for r in reviews]  # fixed length per comment
        return reviews
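
    # Example: with comment_count=2 and review_length=3, an input [[5, 6]] becomes
    # [[5, 6, 0], [emo_idx, emo_idx, emo_idx]]: missing comments are filled with
    # emo_idx rows, then every row is truncated or zero-padded to review_length.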

    def _pad_content(self, content):
        content_count, content_length = self.content_count, self.content_length
        content = content[:content_count] + [[self.PAD_idx] * content_length] * (
            content_count - len(content))  # fixed number of content blocks
        content = [r[:content_length] + [0] * (content_length - len(r)) for r in content]  # fixed length per block
        return content


if __name__ == '__main__':
    config = Config()
    print(f'{date()}## Load word2vec and data...')
    word_emb, word_dict = load_embeddings(config.word2vec_file)
    # print(word_emb[0:3])
    emo_emb, emo_dict = load_embedding(config.emotion_file)
    # print(emo_emb[0:3])
    train_dataset = MPCNDataset(config.train_file, word_dict, emo_dict, config)
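
    # Minimal usage sketch: batch the dataset with a standard DataLoader.
    # batch_size=32 is an illustrative value, not taken from Config.
    from torch.utils.data import DataLoader
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    for post, comments, rating in train_loader:
        # post: (batch, content_count, content_length); comments:
        # (batch, comment_count, review_length); rating: (batch,) float labels
        print(post.shape, comments.shape, rating.shape)
        break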