Commit

Initial commit.
Boreaso committed Sep 7, 2018
1 parent 050d2f8 commit 3ba4741
Showing 6 changed files with 43 additions and 178 deletions.
4 changes: 4 additions & 0 deletions models/bayes_model.py
@@ -39,6 +39,10 @@ def __init__(self,
raise FileNotFoundError('Model file `%s` not found.' % self.model_path)

def train(self, input_file):
"""
训练。
:param input_file: 训练数据集路径
"""
print('`%s` training...' % self.__class__.__name__)

train_start = time.time()
14 changes: 12 additions & 2 deletions models/fasttext_model.py
@@ -51,16 +51,26 @@ def __init__(self,
else:
raise FileNotFoundError('Model file `%s` not found.' % self.model_path)

def train(self, input_file):
def train(self, train_file):
"""
训练模型。
:param train_file: 输入数据集文件路径
format: label word1 word2 word3 ...
"""
print('%s training...' % self.__class__.__name__)
train_start = time.time()
self.model = ft.train_supervised(input_file, **self.kwargs)
self.model = ft.train_supervised(train_file, **self.kwargs)
self.model.save_model(self.model_path)
self.trained = True
print('`%s` train finished, time %ss' %
(self.__class__.__name__, time.time() - train_start))

def predict(self, doc):
"""
预测输入文本。
:param doc: 输入文本数据。
:return: dict,key: 类别标记, value: 文本输入key类的概率
"""
# 加载预训练模型
if not self.trained:
if os.path.exists(self.model_path):
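
For context, here is a minimal usage sketch of the fastText wrapper above. It assumes the class is named FastTextModel and takes a model_path constructor argument; the paths and sample text are illustrative, not part of this commit.

    # Minimal sketch; class name, constructor signature, and paths are assumed.
    from models.fasttext_model import FastTextModel

    model = FastTextModel(model_path='data/fasttext.bin')
    # Each line of the training file: label word1 word2 word3 ...
    model.train('data/train.txt')
    probs = model.predict('这 是 一条 测试 文本')  # {label: probability}
    print(probs)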
172 changes: 2 additions & 170 deletions models/model_helper.py
@@ -1,16 +1,12 @@
import codecs
import collections
import time

import numpy as np
import tensorflow as tf
import tensorflow.contrib.rnn as rnn
import tensorflow.contrib.seq2seq as seq2seq
import tensorflow.contrib.training as tf_training
from tensorflow.contrib.learn import ModeKeys
from utils.iterator import EvalIterator, InferIterator, TrainIterator
from utils.param_utils import get_model_params
from utils.vocabulary import Vocabulary, load_vocab

from utils.vocabulary import load_vocab

# If a vocab size is greater than this value, put the embedding on cpu instead
VOCAB_SIZE_THRESHOLD_CPU = 50000
@@ -358,167 +354,3 @@ def _create_attention_images_summary(final_context_state):
attention_images *= 255
attention_summary = tf.summary.image("attention_images", attention_images)
return attention_summary


class TrainModel(
collections.namedtuple("TrainModel",
("graph",
"model",
"skip_count_placeholder",
"iterator"))):
pass


def create_train_model(hparams,
model_creator,
scope=None):
"""Create train graph, model, and iterator."""
print("# Creating TrainModel...")

src_train_file = "%s/%s.%s" % (hparams.data_dir, hparams.train_prefix, hparams.src_suffix)
tgt_train_file = "%s/%s.%s" % (hparams.data_dir, hparams.train_prefix, hparams.tgt_suffix)
src_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix, hparams.src_suffix)
tgt_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix, hparams.tgt_suffix)
batch_size = hparams.batch_size
num_buckets = hparams.num_buckets

graph = tf.Graph()

with graph.as_default(), tf.container(scope or "train"):
skip_count_placeholder = tf.placeholder(shape=(), dtype=tf.int64)

vocabulary = Vocabulary(
src_vocab_file=src_vocab_file,
tgt_vocab_file=tgt_vocab_file)

iterator = TrainIterator(
vocabulary=vocabulary,
src_data_file=src_train_file,
tgt_data_file=tgt_train_file,
batch_size=batch_size,
num_buckets=num_buckets,
skip_count=skip_count_placeholder)

assert isinstance(hparams, tf_training.HParams)

model_params = get_model_params(
hparams=hparams,
vocabulary=vocabulary,
iterator=iterator)
model_params.add_hparam('mode', ModeKeys.TRAIN)

model = model_creator(**model_params.values())

return TrainModel(
graph=graph,
model=model,
iterator=iterator,
skip_count_placeholder=skip_count_placeholder)


class EvalModel(
collections.namedtuple("EvalModel",
("graph",
"model",
"src_file_placeholder",
"tgt_file_placeholder",
"iterator"))):
pass


def create_eval_model(hparams,
model_creator,
scope=None):
"""Create eval graph, model, src/tgt file holders, and iterator."""
print("# Creating EvalModel...")

src_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix, hparams.src_suffix)
tgt_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix, hparams.tgt_suffix)
batch_size = hparams.batch_size
num_buckets = hparams.num_buckets

graph = tf.Graph()

with graph.as_default(), tf.container(scope or "eval"):
src_eval_file_placeholder = tf.placeholder(shape=(), dtype=tf.string)
tgt_eval_file_placeholder = tf.placeholder(shape=(), dtype=tf.string)

vocabulary = Vocabulary(
src_vocab_file=src_vocab_file,
tgt_vocab_file=tgt_vocab_file)

iterator = EvalIterator(
vocabulary=vocabulary,
src_data_file=src_eval_file_placeholder,
tgt_data_file=tgt_eval_file_placeholder,
batch_size=batch_size,
num_buckets=num_buckets)

assert isinstance(hparams, tf_training.HParams)

model_params = get_model_params(
hparams=hparams,
vocabulary=vocabulary,
iterator=iterator)
model_params.add_hparam('mode', ModeKeys.EVAL)

model = model_creator(**model_params.values())

return EvalModel(
graph=graph,
model=model,
src_file_placeholder=src_eval_file_placeholder,
tgt_file_placeholder=tgt_eval_file_placeholder,
iterator=iterator)


class InferModel(
collections.namedtuple("InferModel",
("graph",
"model",
"src_data_placeholder",
"batch_size_placeholder",
"iterator"))):
pass


def create_infer_model(hparams,
model_creator,
scope=None):
"""Create inference model."""
print("# Creating InferModel...")

src_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix, hparams.src_suffix)
tgt_vocab_file = "%s/%s.%s" % (hparams.data_dir, hparams.vocab_prefix, hparams.tgt_suffix)

graph = tf.Graph()

with graph.as_default(), tf.container(scope or "infer"):
src_data_placeholder = tf.placeholder(shape=[None], dtype=tf.string)
batch_size_placeholder = tf.placeholder(shape=[], dtype=tf.int64)

vocabulary = Vocabulary(
src_vocab_file=src_vocab_file,
tgt_vocab_file=tgt_vocab_file)

iterator = InferIterator(
vocabulary=vocabulary,
src_data=src_data_placeholder,
batch_size=batch_size_placeholder)

assert isinstance(hparams, tf_training.HParams)

model_params = get_model_params(
hparams=hparams,
vocabulary=vocabulary,
iterator=iterator)
model_params.add_hparam('mode', ModeKeys.INFER)

model = model_creator(**model_params.values())

return InferModel(
graph=graph,
model=model,
src_data_placeholder=src_data_placeholder,
batch_size_placeholder=batch_size_placeholder,
iterator=iterator)
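
For reference, a minimal TF 1.x sketch of how the removed TrainModel tuple was meant to be driven. It assumes the TrainIterator exposes an initializer op (as in the TensorFlow NMT tutorial this code follows) and a hypothetical MySeq2SeqModel class:

    # Sketch only; `hparams` and `MySeq2SeqModel` are assumed, and the
    # iterator's `initializer` attribute is an assumption about TrainIterator.
    train_model = create_train_model(hparams, model_creator=MySeq2SeqModel)
    with tf.Session(graph=train_model.graph) as sess:
        sess.run(tf.tables_initializer())
        sess.run(train_model.iterator.initializer,
                 feed_dict={train_model.skip_count_placeholder: 0})
        # The model's training update op can now be run against this iterator.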
29 changes: 23 additions & 6 deletions models/textcnn_model.py
@@ -91,14 +91,20 @@ def __init__(self,

def _build_model(self):
"""
:rtype: Model
TextCNN模型架构,keras实现。
:rtype: keras Model
"""
if self.pre_embedding:
# If pre-trained word vectors are used, no Embedding layer is needed;
# a single sample then has input shape=(sequence_length, embedding_size).

# Input layer.
x_input = Input(shape=(self.sequence_length, self.embedding_size),
dtype='float32')
embedding = x_input
else:
# If the input is a sequence of word ids, an Embedding layer is needed
# to map the words to vector representations.

# Input layer.
x_input = Input(shape=(self.sequence_length,), dtype='int32')

@@ -114,6 +120,8 @@ def _build_model(self):
pool_layers = []

for size in self.filter_sizes:
# Convolution kernels of different sizes.

# Convolution layers.
# input: [None, sequence_length, embedding_size]
# output: [None, sequence_length - size + 1, num_filters]
@@ -155,10 +163,9 @@ def _build_model(self):

def train(self, train_file):
"""
Train textCNN with input data.
:param input_file: input file path
训练模型。
:param train_file: 输入数据集文件路径
format: label word1 word2 word3 ...
:return:
"""
print('`%s` training...' % self.__class__.__name__)

Expand All @@ -179,7 +186,12 @@ def train(self, train_file):
self.model.save(self.model_path)
self.trained = True

def predict(self, line):
def predict(self, doc):
"""
预测输入文本。
:param doc: 输入文本数据。
:return: dict,key: 类别标记, value: 文本输入key类的概率
"""
# 加载预训练模型
if not self.trained:
if os.path.exists(self.model_path):
@@ -188,7 +200,7 @@ def predict(self, doc):
else:
raise FileNotFoundError('Model file `%s` not found.' % self.model_path)

seg = jieba.cut(line)
seg = jieba.cut(doc)

# feature
if self.pre_embedding:
@@ -207,6 +219,11 @@ def predict(self, doc):
return res

def test(self, test_file):
"""
对输入数据集的样本进行测试。
:param test_file: 测试数据集路径
:return: 测试评估指标
"""
print('`%s` testing...' % self.__class__.__name__)

test_start = time.time()
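
To make the multi-branch structure built by _build_model concrete, here is a minimal Keras sketch of the same TextCNN pattern; the layer sizes and class count below are illustrative assumptions, not values from this commit.

    # Parallel Conv1D branches, one per filter size, each max-pooled,
    # then concatenated and fed to a softmax classifier.
    from keras.layers import Input, Conv1D, GlobalMaxPooling1D, Concatenate, Dense
    from keras.models import Model

    sequence_length, embedding_size, num_filters = 100, 128, 64
    x_input = Input(shape=(sequence_length, embedding_size), dtype='float32')
    pools = []
    for size in (3, 4, 5):  # filter_sizes, illustrative
        # [None, sequence_length, embedding_size]
        # -> [None, sequence_length - size + 1, num_filters] -> [None, num_filters]
        conv = Conv1D(num_filters, size, activation='relu')(x_input)
        pools.append(GlobalMaxPooling1D()(conv))
    merged = Concatenate()(pools)
    y_output = Dense(10, activation='softmax')(merged)  # 10 classes, assumed
    model = Model(inputs=x_input, outputs=y_output)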
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,3 +1,4 @@
tensorflow
# Requirements automatically generated by pigar.
# https://github.com/damnever/pigar

Expand Down
1 change: 1 addition & 0 deletions utils/misc_utils.py
@@ -2,6 +2,7 @@


def make_dirs(path):
"""保证指定路径的父文件夹存在"""
parent_dir = os.path.dirname(path)
if not os.path.exists(parent_dir):
os.makedirs(parent_dir)
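
For example, calling make_dirs before saving a model file guarantees the target directory exists; the path below is illustrative.

    # Creates `output/` if it is missing, so `output/model.bin` can be written.
    make_dirs('output/model.bin')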
