Commit

Initial commit.
Boreaso committed Sep 7, 2018
1 parent 050d2f8 commit 3ba4741
Showing 6 changed files with 43 additions and 178 deletions.
4 changes: 4 additions & 0 deletions models/bayes_model.py
@@ -39,6 +39,10 @@ def __init__(self,
raise FileNotFoundError('Model file `%s` not found.' % self.model_path)

def train(self, input_file):
"""
训练。
:param input_file: 训练数据集路径
"""
print('`%s` training...' % self.__class__.__name__)

train_start = time.time()
14 changes: 12 additions & 2 deletions models/fasttext_model.py
@@ -51,16 +51,26 @@ def __init__(self,
else:
raise FileNotFoundError('Model file `%s` not found.' % self.model_path)

def train(self, input_file):
def train(self, train_file):
"""
训练模型。
:param train_file: 输入数据集文件路径
format: label word1 word2 word3 ...
"""
print('%s training...' % self.__class__.__name__)
train_start = time.time()
self.model = ft.train_supervised(input_file, **self.kwargs)
self.model = ft.train_supervised(train_file, **self.kwargs)
self.model.save_model(self.model_path)
self.trained = True
print('`%s` train finished, time %ss' %
(self.__class__.__name__, time.time() - train_start))

def predict(self, doc):
"""
预测输入文本。
:param doc: 输入文本数据。
:return: dict,key: 类别标记, value: 文本输入key类的概率
"""
# 加载预训练模型
if not self.trained:
if os.path.exists(self.model_path):
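
For context, here is a minimal usage sketch of the fastText wrapper above. It assumes the class is named FastTextModel and takes a model_path constructor argument; the paths and sample text are illustrative, not part of this commit.

    # Minimal sketch; class name, constructor signature, and paths are assumed.
    from models.fasttext_model import FastTextModel

    model = FastTextModel(model_path='data/fasttext.bin')
    # Each line of the training file: label word1 word2 word3 ...
    model.train('data/train.txt')
    probs = model.predict('这 是 一条 测试 文本')  # {label: probability}
    print(probs)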
172 changes: 2 additions & 170 deletions models/model_helper.py
@@ -1,16 +1,12 @@
import codecs
import collections
import time

import numpy as np
import tensorflow as tf
import tensorflow.contrib.rnn as rnn
import tensorflow.contrib.seq2seq as seq2seq
import tensorflow.contrib.training as tf_training
from tensorflow.contrib.learn import ModeKeys
from utils.iterator import EvalIterator, InferIterator, TrainIterator
from utils.param_utils import get_model_params
from utils.vocabulary import Vocabulary, load_vocab

from utils.vocabulary import load_vocab

# If a vocab size is greater than this value, put the embedding on cpu instead
VOCAB_SIZE_THRESHOLD_CPU = 50000
@@ -358,167 +354,3 @@ def _create_attention_images_summary(final_context_state):
attention_images *= 255
attention_summary = tf.summary.image("attention_images", attention_images)
return attention_summary


class TrainModel(
collections.namedtuple("TrainModel",
("graph",
"model",
"skip_count_placeholder",
"iterator"))):
pass


def create_train_model(hparams,
model_creator,
scope=None):
"""Create train graph, model, and iterator."""
print("# Creating TrainModel...")

src_train_file = "%s/%s.%s" % (hparams.data_dir, hparams.train_prefix, hparams.src_suffix)
tgt_train_file = "%s/%s.%s" % (hparams.data_dir, hparams.train_prefix, hparams.tgt_suffix)
src_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix, hparams.src_suffix)
tgt_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix, hparams.tgt_suffix)
batch_size = hparams.batch_size
num_buckets = hparams.num_buckets

graph = tf.Graph()

with graph.as_default(), tf.container(scope or "train"):
skip_count_placeholder = tf.placeholder(shape=(), dtype=tf.int64)

vocabulary = Vocabulary(
src_vocab_file=src_vocab_file,
tgt_vocab_file=tgt_vocab_file)

iterator = TrainIterator(
vocabulary=vocabulary,
src_data_file=src_train_file,
tgt_data_file=tgt_train_file,
batch_size=batch_size,
num_buckets=num_buckets,
skip_count=skip_count_placeholder)

assert isinstance(hparams, tf_training.HParams)

model_params = get_model_params(
hparams=hparams,
vocabulary=vocabulary,
iterator=iterator)
model_params.add_hparam('mode', ModeKeys.TRAIN)

model = model_creator(**model_params.values())

return TrainModel(
graph=graph,
model=model,
iterator=iterator,
skip_count_placeholder=skip_count_placeholder)


class EvalModel(
collections.namedtuple("EvalModel",
("graph",
"model",
"src_file_placeholder",
"tgt_file_placeholder",
"iterator"))):
pass


def create_eval_model(hparams,
model_creator,
scope=None):
"""Create eval graph, model, src/tgt file holders, and iterator."""
print("# Creating EvalModel...")

src_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix, hparams.src_suffix)
tgt_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix, hparams.tgt_suffix)
batch_size = hparams.batch_size
num_buckets = hparams.num_buckets

graph = tf.Graph()

with graph.as_default(), tf.container(scope or "eval"):
src_eval_file_placeholder = tf.placeholder(shape=(), dtype=tf.string)
tgt_eval_file_placeholder = tf.placeholder(shape=(), dtype=tf.string)

vocabulary = Vocabulary(
src_vocab_file=src_vocab_file,
tgt_vocab_file=tgt_vocab_file)

iterator = EvalIterator(
vocabulary=vocabulary,
src_data_file=src_eval_file_placeholder,
tgt_data_file=tgt_eval_file_placeholder,
batch_size=batch_size,
num_buckets=num_buckets)

assert isinstance(hparams, tf_training.HParams)

model_params = get_model_params(
hparams=hparams,
vocabulary=vocabulary,
iterator=iterator)
model_params.add_hparam('mode', ModeKeys.EVAL)

model = model_creator(**model_params.values())

return EvalModel(
graph=graph,
model=model,
src_file_placeholder=src_eval_file_placeholder,
tgt_file_placeholder=tgt_eval_file_placeholder,
iterator=iterator)


class InferModel(
collections.namedtuple("InferModel",
("graph",
"model",
"src_data_placeholder",
"batch_size_placeholder",
"iterator"))):
pass


def create_infer_model(hparams,
model_creator,
scope=None):
"""Create inference model."""
print("# Creating InferModel...")

src_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix, hparams.src_suffix)
tgt_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix, hparams.tgt_suffix)

graph = tf.Graph()

with graph.as_default(), tf.container(scope or "infer"):
src_data_placeholder = tf.placeholder(shape=[None], dtype=tf.string)
batch_size_placeholder = tf.placeholder(shape=[], dtype=tf.int64)

vocabulary = Vocabulary(
src_vocab_file=src_vocab_file,
tgt_vocab_file=tgt_vocab_file)

iterator = InferIterator(
vocabulary=vocabulary,
src_data=src_data_placeholder,
batch_size=batch_size_placeholder)

assert isinstance(hparams, tf_training.HParams)

model_params = get_model_params(
hparams=hparams,
vocabulary=vocabulary,
iterator=iterator)
model_params.add_hparam('mode', ModeKeys.INFER)

model = model_creator(**model_params.values())

return InferModel(
graph=graph,
model=model,
src_data_placeholder=src_data_placeholder,
batch_size_placeholder=batch_size_placeholder,
iterator=iterator)
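
For reference, a minimal TF 1.x sketch of how the removed TrainModel tuple was meant to be driven. It assumes the TrainIterator exposes an initializer op (as in the TensorFlow NMT tutorial this code follows) and a hypothetical MySeq2SeqModel class:

    # Sketch only; `hparams` and `MySeq2SeqModel` are assumed, and the
    # iterator's `initializer` attribute is an assumption about TrainIterator.
    train_model = create_train_model(hparams, model_creator=MySeq2SeqModel)
    with tf.Session(graph=train_model.graph) as sess:
        sess.run(tf.tables_initializer())
        sess.run(train_model.iterator.initializer,
                 feed_dict={train_model.skip_count_placeholder: 0})
        # The model's training update op can now be run against this iterator.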
29 changes: 23 additions & 6 deletions models/textcnn_model.py
@@ -91,14 +91,20 @@ def __init__(self,

def _build_model(self):
"""
:rtype: Model
TextCNN模型架构,keras实现。
:rtype: keras Model
"""
if self.pre_embedding:
# If pre-trained word vectors are used, no Embedding layer is needed;
# a single sample then has input shape=(sequence_length, embedding_size).

# Input layer.
x_input = Input(shape=(self.sequence_length, self.embedding_size),
dtype='float32')
embedding = x_input
else:
# If the input is a sequence of word ids, an Embedding layer is needed
# to map the words to vector representations.

# Input layer.
x_input = Input(shape=(self.sequence_length,), dtype='int32')

@@ -114,6 +120,8 @@ def _build_model(self):
pool_layers = []

for size in self.filter_sizes:
# Convolution kernels of different sizes.

# Convolution layers.
# input: [None, sequence_length, embedding_size]
# output: [None, sequence_length - size + 1, num_filters]
@@ -155,10 +163,9 @@ def _build_model(self):

def train(self, train_file):
"""
Train textCNN with input data.
:param input_file: input file path
训练模型。
:param train_file: 输入数据集文件路径
format: label word1 word2 word3 ...
:return:
"""
print('`%s` training...' % self.__class__.__name__)

Expand All @@ -179,7 +186,12 @@ def train(self, train_file):
self.model.save(self.model_path)
self.trained = True

def predict(self, line):
def predict(self, doc):
"""
预测输入文本。
:param doc: 输入文本数据。
:return: dict,key: 类别标记, value: 文本输入key类的概率
"""
# 加载预训练模型
if not self.trained:
if os.path.exists(self.model_path):
@@ -188,7 +200,7 @@ def predict(self, doc):
else:
raise FileNotFoundError('Model file `%s` not found.' % self.model_path)

seg = jieba.cut(line)
seg = jieba.cut(doc)

# feature
if self.pre_embedding:
@@ -207,6 +219,11 @@ def predict(self, doc):
return res

def test(self, test_file):
"""
对输入数据集的样本进行测试。
:param test_file: 测试数据集路径
:return: 测试评估指标
"""
print('`%s` testing...' % self.__class__.__name__)

test_start = time.time()
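
To make the multi-branch structure built by _build_model concrete, here is a minimal Keras sketch of the same TextCNN pattern; the layer sizes and class count below are illustrative assumptions, not values from this commit.

    # Parallel Conv1D branches, one per filter size, each max-pooled,
    # then concatenated and fed to a softmax classifier.
    from keras.layers import Input, Conv1D, GlobalMaxPooling1D, Concatenate, Dense
    from keras.models import Model

    sequence_length, embedding_size, num_filters = 100, 128, 64
    x_input = Input(shape=(sequence_length, embedding_size), dtype='float32')
    pools = []
    for size in (3, 4, 5):  # filter_sizes, illustrative
        # [None, sequence_length, embedding_size]
        # -> [None, sequence_length - size + 1, num_filters] -> [None, num_filters]
        conv = Conv1D(num_filters, size, activation='relu')(x_input)
        pools.append(GlobalMaxPooling1D()(conv))
    merged = Concatenate()(pools)
    y_output = Dense(10, activation='softmax')(merged)  # 10 classes, assumed
    model = Model(inputs=x_input, outputs=y_output)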
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,3 +1,4 @@
tensorflow
# Requirements automatically generated by pigar.
# https://github.com/damnever/pigar

Expand Down
1 change: 1 addition & 0 deletions utils/misc_utils.py
@@ -2,6 +2,7 @@


def make_dirs(path):
"""保证指定路径的父文件夹存在"""
parent_dir = os.path.dirname(path)
if not os.path.exists(parent_dir):
os.makedirs(parent_dir)
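
For example, calling make_dirs before saving a model file guarantees the target directory exists; the path below is illustrative.

    # Creates `output/` if it is missing, so `output/model.bin` can be written.
    make_dirs('output/model.bin')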
