diff --git a/models/bayes_model.py b/models/bayes_model.py index df6f1a7..267cff2 100644 --- a/models/bayes_model.py +++ b/models/bayes_model.py @@ -39,6 +39,10 @@ def __init__(self, raise FileNotFoundError('Model file `%s` not found.' % self.model_path) def train(self, input_file): + """ + 训练。 + :param input_file: 训练数据集路径 + """ print('`%s` training...' % self.__class__.__name__) train_start = time.time() diff --git a/models/fasttext_model.py b/models/fasttext_model.py index 254e94d..a65a935 100644 --- a/models/fasttext_model.py +++ b/models/fasttext_model.py @@ -51,16 +51,26 @@ def __init__(self, else: raise FileNotFoundError('Model file `%s` not found.' % self.model_path) - def train(self, input_file): + def train(self, train_file): + """ + 训练模型。 + :param train_file: 输入数据集文件路径 + format: label word1 word2 word3 ... + """ print('%s training...' % self.__class__.__name__) train_start = time.time() - self.model = ft.train_supervised(input_file, **self.kwargs) + self.model = ft.train_supervised(train_file, **self.kwargs) self.model.save_model(self.model_path) self.trained = True print('`%s` train finished, time %ss' % (self.__class__.__name__, time.time() - train_start)) def predict(self, doc): + """ + 预测输入文本。 + :param doc: 输入文本数据。 + :return: dict,key: 类别标记, value: 文本输入key类的概率 + """ # 加载预训练模型 if not self.trained: if os.path.exists(self.model_path): diff --git a/models/model_helper.py b/models/model_helper.py index 47a986d..f8df162 100644 --- a/models/model_helper.py +++ b/models/model_helper.py @@ -1,16 +1,12 @@ import codecs -import collections import time import numpy as np import tensorflow as tf import tensorflow.contrib.rnn as rnn import tensorflow.contrib.seq2seq as seq2seq -import tensorflow.contrib.training as tf_training -from tensorflow.contrib.learn import ModeKeys -from utils.iterator import EvalIterator, InferIterator, TrainIterator -from utils.param_utils import get_model_params -from utils.vocabulary import Vocabulary, load_vocab + +from utils.vocabulary import load_vocab # If a vocab size is greater than this value, put the embedding on cpu instead VOCAB_SIZE_THRESHOLD_CPU = 50000 @@ -358,167 +354,3 @@ def _create_attention_images_summary(final_context_state): attention_images *= 255 attention_summary = tf.summary.image("attention_images", attention_images) return attention_summary - - -class TrainModel( - collections.namedtuple("TrainModel", - ("graph", - "model", - "skip_count_placeholder", - "iterator"))): - pass - - -def create_train_model(hparams, - model_creator, - scope=None): - """Create train graph, model, and iterator.""" - print("# Creating TrainModel...") - - src_train_file = "%s/%s.%s" % (hparams.data_dir, hparams.train_prefix, hparams.src_suffix) - tgt_train_file = "%s/%s.%s" % (hparams.data_dir, hparams.train_prefix, hparams.tgt_suffix) - src_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix, hparams.src_suffix) - tgt_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix, hparams.tgt_suffix) - batch_size = hparams.batch_size - num_buckets = hparams.num_buckets - - graph = tf.Graph() - - with graph.as_default(), tf.container(scope or "train"): - skip_count_placeholder = tf.placeholder(shape=(), dtype=tf.int64) - - vocabulary = Vocabulary( - src_vocab_file=src_vocab_file, - tgt_vocab_file=tgt_vocab_file) - - iterator = TrainIterator( - vocabulary=vocabulary, - src_data_file=src_train_file, - tgt_data_file=tgt_train_file, - batch_size=batch_size, - num_buckets=num_buckets, - skip_count=skip_count_placeholder) - - assert isinstance(hparams, tf_training.HParams) - - model_params = get_model_params( - hparams=hparams, - vocabulary=vocabulary, - iterator=iterator) - model_params.add_hparam('mode', ModeKeys.TRAIN) - - model = model_creator(**model_params.values()) - - return TrainModel( - graph=graph, - model=model, - iterator=iterator, - skip_count_placeholder=skip_count_placeholder) - - -class EvalModel( - collections.namedtuple("EvalModel", - ("graph", - "model", - "src_file_placeholder", - "tgt_file_placeholder", - "iterator"))): - pass - - -def create_eval_model(hparams, - model_creator, - scope=None): - """Create eval graph, model, src/tgt file holders, and iterator.""" - print("# Creating EvalModel...") - - src_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix, hparams.src_suffix) - tgt_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix, hparams.tgt_suffix) - batch_size = hparams.batch_size - num_buckets = hparams.num_buckets - - graph = tf.Graph() - - with graph.as_default(), tf.container(scope or "eval"): - src_eval_file_placeholder = tf.placeholder(shape=(), dtype=tf.string) - tgt_eval_file_placeholder = tf.placeholder(shape=(), dtype=tf.string) - - vocabulary = Vocabulary( - src_vocab_file=src_vocab_file, - tgt_vocab_file=tgt_vocab_file) - - iterator = EvalIterator( - vocabulary=vocabulary, - src_data_file=src_eval_file_placeholder, - tgt_data_file=tgt_eval_file_placeholder, - batch_size=batch_size, - num_buckets=num_buckets) - - assert isinstance(hparams, tf_training.HParams) - - model_params = get_model_params( - hparams=hparams, - vocabulary=vocabulary, - iterator=iterator) - model_params.add_hparam('mode', ModeKeys.EVAL) - - model = model_creator(**model_params.values()) - - return EvalModel( - graph=graph, - model=model, - src_file_placeholder=src_eval_file_placeholder, - tgt_file_placeholder=tgt_eval_file_placeholder, - iterator=iterator) - - -class InferModel( - collections.namedtuple("InferModel", - ("graph", - "model", - "src_data_placeholder", - "batch_size_placeholder", - "iterator"))): - pass - - -def create_infer_model(hparams, - model_creator, - scope=None): - """Create inference model.""" - print("# Creating InferModel...") - - src_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix, hparams.src_suffix) - tgt_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix, hparams.tgt_suffix) - - graph = tf.Graph() - - with graph.as_default(), tf.container(scope or "infer"): - src_data_placeholder = tf.placeholder(shape=[None], dtype=tf.string) - batch_size_placeholder = tf.placeholder(shape=[], dtype=tf.int64) - - vocabulary = Vocabulary( - src_vocab_file=src_vocab_file, - tgt_vocab_file=tgt_vocab_file) - - iterator = InferIterator( - vocabulary=vocabulary, - src_data=src_data_placeholder, - batch_size=batch_size_placeholder) - - assert isinstance(hparams, tf_training.HParams) - - model_params = get_model_params( - hparams=hparams, - vocabulary=vocabulary, - iterator=iterator) - model_params.add_hparam('mode', ModeKeys.INFER) - - model = model_creator(**model_params.values()) - - return InferModel( - graph=graph, - model=model, - src_data_placeholder=src_data_placeholder, - batch_size_placeholder=batch_size_placeholder, - iterator=iterator) diff --git a/models/textcnn_model.py b/models/textcnn_model.py index fdef9d7..7f6e8b3 100644 --- a/models/textcnn_model.py +++ b/models/textcnn_model.py @@ -91,14 +91,20 @@ def __init__(self, def _build_model(self): """ - :rtype: Model + TextCNN模型架构,keras实现。 + :rtype: keras Model """ if self.pre_embedding: + # 如果使用预训练的词向量,不需要Embedding层, + # 单个样本输入shape=(sequence_length, embedding_size) + # Input layer. x_input = Input(shape=(self.sequence_length, self.embedding_size), dtype='float32') embedding = x_input else: + # 如果输入为单词id序列,需要添加Embedding层对词进行向量化表示 + # Input layer. x_input = Input(shape=(self.sequence_length,), dtype='int32') @@ -114,6 +120,8 @@ def _build_model(self): pool_layers = [] for size in self.filter_sizes: + # 不同尺度的卷积核 + # Convolution layers. # input: [None, sequence_length, embedding_size] # output: [None, sequence_length - size + 1, num_filters] @@ -155,10 +163,9 @@ def _build_model(self): def train(self, train_file): """ - Train textCNN with input data. - :param input_file: input file path + 训练模型。 + :param train_file: 输入数据集文件路径 format: label word1 word2 word3 ... - :return: """ print('`%s` training...' % self.__class__.__name__) @@ -179,7 +186,12 @@ def train(self, train_file): self.model.save(self.model_path) self.trained = True - def predict(self, line): + def predict(self, doc): + """ + 预测输入文本。 + :param doc: 输入文本数据。 + :return: dict,key: 类别标记, value: 文本输入key类的概率 + """ # 加载预训练模型 if not self.trained: if os.path.exists(self.model_path): @@ -188,7 +200,7 @@ def predict(self, line): else: raise FileNotFoundError('Model file `%s` not found.' % self.model_path) - seg = jieba.cut(line) + seg = jieba.cut(doc) # feature if self.pre_embedding: @@ -207,6 +219,11 @@ def predict(self, line): return res def test(self, test_file): + """ + 对输入数据集的样本进行测试。 + :param test_file: 测试数据集路径 + :return: 测试评估指标 + """ print('`%s` testing...' % self.__class__.__name__) test_start = time.time() diff --git a/requirements.txt b/requirements.txt index 4e5d649..f89aaa4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +tensorflow # Requirements automatically generated by pigar. # https://github.com/damnever/pigar diff --git a/utils/misc_utils.py b/utils/misc_utils.py index ef09838..1cb3c5f 100644 --- a/utils/misc_utils.py +++ b/utils/misc_utils.py @@ -2,6 +2,7 @@ def make_dirs(path): + """保证指定路径的父文件夹存在""" parent_dir = os.path.dirname(path) if not os.path.exists(parent_dir): os.makedirs(parent_dir)