forked from PaddlePaddle/PaddleHub
-
Notifications
You must be signed in to change notification settings - Fork 0
/
module.py
161 lines (138 loc) · 6.09 KB
/
module.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# -*- coding:utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import math
import os
import six
import paddle.fluid as fluid
import paddlehub as hub
from paddlehub.common.paddle_helper import get_variable_info
from paddlehub.module.module import moduleinfo, serving
from paddlehub.reader import tokenization
from porn_detection_gru.processor import load_vocab, preprocess, postprocess
@moduleinfo(
    name="porn_detection_gru",
    version="1.1.0",
    summary="Baidu's open-source Porn Detection Model.",
    author="baidu-nlp",
    author_email="",
    type="nlp/sentiment_analysis")
class PornDetectionGRU(hub.NLPPredictionModule):
    """PaddleHub module wrapping a pretrained GRU-based porn-text classifier.

    Loads an exported inference model plus vocabulary assets from the module
    directory and exposes `detection` (also served and aliased as `predict`)
    for batch text classification.
    """

    def _initialize(self):
        """
        Initialize with the necessary elements (model path, vocabularies,
        tokenizer, and the prediction config).
        """
        self.pretrained_model_path = os.path.join(self.directory, "infer_model")
        self.tokenizer_vocab_path = os.path.join(self.directory, "assets", "vocab.txt")
        self.vocab_path = os.path.join(self.directory, "assets", "word_dict.txt")
        self.vocab = load_vocab(self.vocab_path)
        # Inputs are truncated/padded to this many tokens by preprocess().
        self.sequence_max_len = 256
        self.tokenizer = tokenization.FullTokenizer(self.tokenizer_vocab_path)
        self.param_file = os.path.join(self.directory, "assets", "params.txt")
        # Alias so callers may use the generic `predict` entry point.
        self.predict = self.detection
        self._set_config()

    def context(self, trainable=False):
        """
        Get the input, output and program of the pretrained porn_detection_gru.

        Args:
            trainable(bool): whether to fine-tune the pretrained parameters
                of porn_detection_gru or not

        Returns:
            inputs(dict): the input variables of porn_detection_gru (words)
            outputs(dict): the output variables of porn_detection_gru
                (the sentiment prediction results)
            main_program(Program): the main_program with pretrained parameters
        """
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        program, feed_target_names, fetch_targets = fluid.io.load_inference_model(
            dirname=self.pretrained_model_path, executor=exe)

        # Re-register persistable variables listed in params.txt as parameters,
        # so `trainable` can be toggled on them below.
        with open(self.param_file, 'r') as file:
            params_list = file.readlines()
        for param in params_list:
            param = param.strip()
            var = program.global_block().var(param)
            var_info = get_variable_info(var)
            program.global_block().create_parameter(
                shape=var_info['shape'], dtype=var_info['dtype'], name=var_info['name'])

        for param in program.global_block().iter_parameters():
            param.trainable = trainable

        for name, var in program.global_block().vars.items():
            if name == feed_target_names[0]:
                inputs = {"words": var}
            # output of the second layer from the end, i.e. the layer before
            # the prediction layer (fc-softmax)
            if name == "@HUB_porn_detection_gru@layer_norm_0.tmp_2":
                outputs = {"class_probs": fetch_targets[0], "sentence_feature": var}
        return inputs, outputs, program

    @serving
    def detection(self, texts=None, data=None, use_gpu=False, batch_size=1):
        """
        Get the porn prediction results with the texts as input.

        Args:
            texts(list): the input texts to be predicted; use this OR `data`
            data(dict): key must be 'text', value is the list of texts to be
                predicted; use this OR `texts`
            use_gpu(bool): whether to use gpu to predict or not
            batch_size(int): the number of texts the program deals with at once

        Returns:
            results(list): the porn prediction results

        Raises:
            ValueError: if neither `texts` nor `data` carries a non-empty
                list of texts.
        """
        # Avoid mutable default arguments while keeping the original
        # "empty means not provided" semantics.
        texts = [] if texts is None else texts
        data = {} if data is None else data

        try:
            _places = os.environ["CUDA_VISIBLE_DEVICES"]
            int(_places[0])
        except (KeyError, ValueError, IndexError):
            # CUDA_VISIBLE_DEVICES is unset, empty, or not a device index:
            # silently fall back to CPU prediction.
            use_gpu = False

        if texts != [] and isinstance(texts, list) and data == {}:
            predicted_data = texts
        elif texts == [] and isinstance(data, dict) and isinstance(data.get('text', None), list) and data['text']:
            predicted_data = data["text"]
        else:
            raise ValueError("The input data is inconsistent with expectations.")

        predicted_data = self.to_unicode(predicted_data)
        start_idx = 0
        iteration = int(math.ceil(len(predicted_data) / batch_size))
        results = []
        for i in range(iteration):
            if i < (iteration - 1):
                batch_data = predicted_data[start_idx:(start_idx + batch_size)]
            else:
                # Last batch takes whatever is left (may be short).
                batch_data = predicted_data[start_idx:]
            start_idx = start_idx + batch_size
            processed_results = preprocess(batch_data, self.tokenizer, self.vocab, self.sequence_max_len)
            tensor_words = self.texts2tensor(processed_results)

            if use_gpu:
                batch_out = self.gpu_predictor.run([tensor_words])
            else:
                batch_out = self.cpu_predictor.run([tensor_words])
            batch_result = postprocess(batch_out[0], processed_results)
            results += batch_result
        return results

    def get_labels(self):
        """
        Get the labels which were used when pretraining.

        Returns:
            self.labels(dict): label name -> label id
        """
        self.labels = {"porn": 1, "not_porn": 0}
        return self.labels
if __name__ == "__main__":
    porn_detection_gru = PornDetectionGRU()
    porn_detection_gru.context()
    # porn_detection_gru = hub.Module(name='porn_detection_gru')
    test_text = ["黄片下载", "打击黄牛党"]

    def show_results(results):
        """Print each prediction dict, keeping non-ASCII text readable on Python 2."""
        for result in results:
            if six.PY2:
                print(json.dumps(result, encoding="utf8", ensure_ascii=False))
            else:
                print(result)

    # Exercise the list-style input.
    results = porn_detection_gru.detection(texts=test_text)
    for index, text in enumerate(test_text):
        results[index]["text"] = text
    show_results(results)

    # Exercise the dict-style input.
    input_dict = {"text": test_text}
    results = porn_detection_gru.detection(data=input_dict)
    for index, text in enumerate(test_text):
        results[index]["text"] = text
    show_results(results)