Merge pull request #42 from amansrivastava17/hub-module-fast

Making module faster
amansrivastava17 · Dec 29, 2019 · 4ddb127 · 4ddb127
2 parents 63cf57f + baccc3a
commit 4ddb127
Show file tree

Hide file tree

Showing 13 changed files with 222 additions and 108 deletions.
diff --git a/README.md b/README.md
@@ -62,7 +62,7 @@ $ pip install embedding-as-service
 #### 1. **Initialise encoder using supported embedding** and models from <a href="#-supported-embeddings-and-models">here</a>  
 ```python  
 >>> from embedding_as_service.text.encode import Encoder  
->>> en = Encoder(embedding='bert', model='bert_base_cased', download=True)  
+>>> en = Encoder(embedding='bert', model='bert_base_cased')  
 ```  
 #### 2. Get sentences **tokens embedding**  
 ```python 
@@ -99,8 +99,9 @@ array([[-0.33547154,  0.34566957,  1.1954105 , ...,  0.33702594,
 ```  
 
 #### 4. Use custom `max_seq_length`, default is 128  
-```python  
->>> vecs = en.encode(texts=['hello aman', 'how are you?'], max_seq_length=256)  
+```python 
+>>> en = Encoder(embedding='bert', model='bert_base_cased', max_seq_length=256)  
+>>> vecs = en.encode(texts=['hello aman', 'how are you?'])  
 >>> vecs  
 array([[ 0.48388457, -0.01327741, -0.76577514, ..., -0.54265064,  
  -0.5564591 ,  0.6454179 ], [ 0.53209245,  0.00526248, -0.71091074, ..., -0.5171917 , -0.40458363,  0.6779779 ]], dtype=float32)  
@@ -131,15 +132,14 @@ array([[ 0.48388457, -0.01327741, -0.76577514, ..., -0.54265064,
 |--------------------|------|-------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | `embedding` | str | *Required* | embedding method to be used, check `Embedding` column <a href="#-supported-embeddings-and-models">here</a>|
 | `model`| str |*Required*| Model to be used for mentioned embedding, check `Model` column <a href="#-supported-embeddings-and-models">here</a>|
-| `download`| bool |`False`| Download model if model does not exists|
+| `max_seq_length`| int |128| Maximum Sequence Length, default is 128|
 
 2. **def** <span style="color:blue">`embedding_as_service.text.encode.Encoder.encode`</span>
 
   | Argument | Type | Default | Description |
 |--------------------|------|-------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | `texts` | List[str] or List[List[str]] | *Required* | List of sentences, or list of lists of sentence tokens when `is_tokenized=True` |
 | `pooling`| str |(Optional)| Pooling method to apply; the available methods are listed <a href="#-pooling-strategies-">here</a>|
-| `max_seq_length`| int | `128` | Maximum Sequence Length, default is 128|
 | `is_tokenized` | bool | `False` | Set to `True` when already-tokenized input is passed for encoding |  
 | `batch_size` | int | `128` | maximum number of sequences handled by encoder, larger batch will be partitioned into small batches. |
 
@@ -211,9 +211,9 @@ Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/d
   <tr>
     <td align="center"><a href="https://www.linkedin.com/in/aman-srivastava-a8bb1285/"><img src="https://avatars0.githubusercontent.com/u/5950398?v=4" width="100px;" alt="Aman Srivastava"/><br /><sub><b>Aman Srivastava</b></sub></a><br /><a href="https://github.com/amansrivastava17/embedding-as-service/commits?author=amansrivastava17" title="Code">💻</a> <a href="https://github.com/amansrivastava17/embedding-as-service/commits?author=amansrivastava17" title="Documentation">📖</a> <a href="#infra-amansrivastava17" title="Infrastructure (Hosting, Build-Tools, etc)">🚇</a></td>
     <td align="center"><a href="https://github.com/ashutoshsingh0223"><img src="https://avatars3.githubusercontent.com/u/40604544?v=4" width="100px;" alt="Ashutosh Singh"/><br /><sub><b>Ashutosh Singh</b></sub></a><br /><a href="https://github.com/amansrivastava17/embedding-as-service/commits?author=ashutoshsingh0223" title="Code">💻</a> <a href="https://github.com/amansrivastava17/embedding-as-service/commits?author=ashutoshsingh0223" title="Documentation">📖</a> <a href="#infra-ashutoshsingh0223" title="Infrastructure (Hosting, Build-Tools, etc)">🚇</a></td>
+    <td align="center"><a href="https://chiragjn.github.io"><img src="https://avatars2.githubusercontent.com/u/10295418?v=4" width="100px;" alt="Chirag Jain"/><br /><sub><b>Chirag Jain</b></sub></a><br /><a href="https://github.com/amansrivastava17/embedding-as-service/commits?author=chiragjn" title="Code">💻</a> <a href="https://github.com/amansrivastava17/embedding-as-service/commits?author=chiragjn" title="Documentation">📖</a> <a href="#infra-chiragjn" title="Infrastructure (Hosting, Build-Tools, etc)">🚇</a></td>
     <td align="center"><a href="https://github.com/MrPranav101"><img src="https://avatars0.githubusercontent.com/u/43914392?v=4" width="100px;" alt="MrPranav101"/><br /><sub><b>MrPranav101</b></sub></a><br /><a href="https://github.com/amansrivastava17/embedding-as-service/commits?author=MrPranav101" title="Code">💻</a> <a href="https://github.com/amansrivastava17/embedding-as-service/commits?author=MrPranav101" title="Documentation">📖</a> <a href="#infra-MrPranav101" title="Infrastructure (Hosting, Build-Tools, etc)">🚇</a></td>    
     <td align="center"><a href="https://www.linkedin.com/in/dhavaltaunk08/"><img src="https://avatars0.githubusercontent.com/u/31320833?v=4" width="100px;" alt="Dhaval Taunk"/><br /><sub><b>Dhaval Taunk</b></sub></a><br /><a href="https://github.com/amansrivastava17/embedding-as-service/commits?author=DhavalTaunk08" title="Code">💻</a> <a href="https://github.com/amansrivastava17/embedding-as-service/commits?author=DhavalTaunk08" title="Documentation">📖</a> <a href="#infra-DhavalTaunk08" title="Infrastructure (Hosting, Build-Tools, etc)">🚇</a></td>
-    <td align="center"><a href="https://chiragjn.github.io"><img src="https://avatars2.githubusercontent.com/u/10295418?v=4" width="100px;" alt="Chirag Jain"/><br /><sub><b>Chirag Jain</b></sub></a><br /><a href="https://github.com/amansrivastava17/embedding-as-service/commits?author=chiragjn" title="Code">💻</a> <a href="https://github.com/amansrivastava17/embedding-as-service/commits?author=chiragjn" title="Documentation">📖</a> <a href="#infra-chiragjn" title="Infrastructure (Hosting, Build-Tools, etc)">🚇</a></td>
   </tr>
 </table>
 

diff --git a/embedding_as_service/text/albert/__init__.py b/embedding_as_service/text/albert/__init__.py
@@ -60,12 +60,18 @@ class Embeddings(object):
 
     def __init__(self):
         self.sess = tf.Session()
-        self.albert_module = None
+        self.albert_outputs = None
         self.model_name = None
+        self.max_seq_length = None
+
+        # placeholder definition
+        self.input_ids = None
+        self.input_masks = None
+        self.segment_ids = None
 
     def create_tokenizer_from_hub_module(self):
         """Get the vocab file and casing info from the Hub module."""
-        tokenization_info = self.albert_module(signature="tokenization_info", as_dict=True)
+        tokenization_info = self.albert_outputs(signature="tokenization_info", as_dict=True)
 
         sentence_piece_file, do_lower_case = self.sess.run([tokenization_info["vocab_file"],
                                                    tokenization_info["do_lower_case"]])
@@ -78,9 +84,9 @@ def create_tokenizer_from_hub_module(self):
     def tokenize(cls, text):
         return cls.tokenizer.tokenize(text)
 
-    @staticmethod
-    def _model_single_input(text: Union[str, List[str]], max_seq_length: int, is_tokenized: bool = False
+    def _model_single_input(self, text: Union[str, List[str]], is_tokenized: bool = False
                             ) -> Tuple[List[int], List[int], List[int]]:
+        max_seq_length = self.max_seq_length
         tokens_a = text
         if not is_tokenized:
             tokens_a = Embeddings.tokenize(text)
@@ -115,35 +121,54 @@ def _model_single_input(text: Union[str, List[str]], max_seq_length: int, is_tok
 
         return input_ids, input_mask, segment_ids
 
-    def load_model(self, model: str, model_path: str):
-        self.albert_module = hub.Module(model_path)
-        self.sess.run(tf.initializers.global_variables())
-        self.create_tokenizer_from_hub_module()
+    def load_model(self, model: str, model_path: str, max_seq_length: int):
+        g = tf.Graph()
+        with g.as_default():
+            self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None, max_seq_length])
+            self.input_masks = tf.placeholder(dtype=tf.int32, shape=[None, max_seq_length])
+            self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, max_seq_length])
+
+            hub_module = hub.Module(model_path)
+            albert_inputs = dict(
+                input_ids=self.input_ids,
+                input_mask=self.input_masks,
+                segment_ids=self.segment_ids
+            )
+            self.albert_outputs = hub_module(albert_inputs, signature="tokens", as_dict=True)
+            tokenization_info = hub_module(signature="tokenization_info", as_dict=True)
+            init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
+        g.finalize()
+        self.sess = tf.Session(graph=g)
+        self.sess.run(init_op)
+        sentence_piece_file, do_lower_case = self.sess.run(
+            [tokenization_info["vocab_file"], tokenization_info["do_lower_case"]]
+        )
+        Embeddings.tokenizer = FullTokenizer(vocab_file=None,
+                                             do_lower_case=do_lower_case,
+                                             spm_model_file=sentence_piece_file)
+        self.max_seq_length = max_seq_length
         self.model_name = model
         print("Model loaded Successfully !")
 
     def encode(self, texts: Union[List[str], List[List[str]]],
                pooling: str,
-               max_seq_length: int,
                is_tokenized: bool = False,
                **kwargs
                ) -> Optional[np.array]:
         input_ids, input_masks, segment_ids = [], [], []
         for text in tqdm(texts, desc="Converting texts to features"):
-            input_id, input_mask, segment_id = self._model_single_input(text, max_seq_length, is_tokenized)
+            input_id, input_mask, segment_id = self._model_single_input(text,  is_tokenized)
             input_ids.append(input_id)
             input_masks.append(input_mask)
             segment_ids.append(segment_id)
 
-        albert_inputs = dict(
-            input_ids=np.array(input_ids),
-            input_mask=np.array(input_masks),
-            segment_ids=np.array(segment_ids))
-
-        bert_outputs = self.albert_module(albert_inputs, signature="tokens", as_dict=True)
-        sequence_output = bert_outputs["sequence_output"]
+        albert_inputs = {
+            self.input_ids: np.array(input_ids),
+            self.input_masks: np.array(input_masks),
+            self.segment_ids: np.array(segment_ids)
+        }
 
-        token_embeddings = self.sess.run(sequence_output)
+        token_embeddings = self.sess.run(self.albert_outputs, feed_dict=albert_inputs)["sequence_output"]
 
         if not pooling:
             return token_embeddings

diff --git a/embedding_as_service/text/albert/tokenization.py b/embedding_as_service/text/albert/tokenization.py
@@ -262,15 +262,13 @@ def tokenize(self, text):
 
     def convert_tokens_to_ids(self, tokens):
         if self.sp_model:
-            tf.logging.info("using sentence piece tokenzier.")
             return [self.sp_model.PieceToId(
                 printable_text(token)) for token in tokens]
         else:
             return convert_by_vocab(self.vocab, tokens)
 
     def convert_ids_to_tokens(self, ids):
         if self.sp_model:
-            tf.logging.info("using sentence piece tokenzier.")
             return [self.sp_model.IdToPiece(id_) for id_ in ids]
         else:
             return convert_by_vocab(self.inv_vocab, ids)

diff --git a/embedding_as_service/text/bert/__init__.py b/embedding_as_service/text/bert/__init__.py
@@ -75,29 +75,24 @@ class Embeddings(object):
 
     def __init__(self):
         self.sess = tf.Session()
-        self.bert_module = None
+        self.bert_outputs = None
         self.model_name = None
+        self.max_seq_length = None
 
-    def create_tokenizer_from_hub_module(self, model_path: str):
-        """Get the vocab file and casing info from the Hub module."""
-        tokenization_info = self.bert_module(signature="tokenization_info", as_dict=True)
-        vocab_file, do_lower_case = self.sess.run(
-            [
-                tokenization_info["vocab_file"],
-                tokenization_info["do_lower_case"],
-            ]
-        )
-
-        Embeddings.tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
+        # placeholder definition
+        self.input_ids = None
+        self.input_masks = None
+        self.segment_ids = None
 
     @classmethod
     def tokenize(cls, text):
         return cls.tokenizer.tokenize(text)
 
-    @staticmethod
-    def _model_single_input(text: Union[str, List[str]], max_seq_length: int, is_tokenized: bool = False
+    def _model_single_input(self, text: Union[str, List[str]],  is_tokenized: bool = False
                             ) -> Tuple[List[int], List[int], List[int]]:
+        max_seq_length = self.max_seq_length
         tokens_a = text
+
         if not is_tokenized:
             tokens_a = Embeddings.tokenize(text)
             if len(tokens_a) > max_seq_length - 2:
@@ -128,38 +123,60 @@ def _model_single_input(text: Union[str, List[str]], max_seq_length: int, is_tok
         assert len(input_ids) == max_seq_length
         assert len(input_mask) == max_seq_length
         assert len(segment_ids) == max_seq_length
-
         return input_ids, input_mask, segment_ids
 
-    def load_model(self, model: str, model_path: str):
-        self.bert_module = hub.Module(model_path)
-        self.sess.run(tf.initializers.global_variables())
-        self.create_tokenizer_from_hub_module(model_path)
+    def load_model(self, model: str, model_path: str, max_seq_length: int):
+        g = tf.Graph()
+        with g.as_default():
+            self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None, max_seq_length])
+            self.input_masks = tf.placeholder(dtype=tf.int32, shape=[None, max_seq_length])
+            self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, max_seq_length])
+
+            hub_module = hub.Module(model_path)
+            bert_inputs = dict(
+                input_ids=self.input_ids,
+                input_mask=self.input_masks,
+                segment_ids=self.segment_ids
+            )
+
+            self.bert_outputs = hub_module(bert_inputs, signature="tokens", as_dict=True)
+            tokenization_info = hub_module(signature="tokenization_info", as_dict=True)
+            init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
+        g.finalize()
+        self.sess = tf.Session(graph=g)
+        self.sess.run(init_op)
+        vocab_file, do_lower_case = self.sess.run(
+            [
+                tokenization_info["vocab_file"],
+                tokenization_info["do_lower_case"],
+            ]
+        )
+        Embeddings.tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
+        self.max_seq_length = max_seq_length
         self.model_name = model
+
         print("Model loaded Successfully !")
 
     def encode(self, texts: Union[List[str], List[List[str]]],
                pooling: str,
-               max_seq_length: int,
                is_tokenized: bool = False,
                **kwargs
                ) -> Optional[np.array]:
-        input_ids, input_masks, segment_ids = [], [], []
-        for text in tqdm(texts, desc="Converting texts to features"):
-            input_id, input_mask, segment_id = self._model_single_input(text, max_seq_length, is_tokenized)
-            input_ids.append(input_id)
-            input_masks.append(input_mask)
-            segment_ids.append(segment_id)
-
-        bert_inputs = dict(
-            input_ids=np.array(input_ids),
-            input_mask=np.array(input_masks),
-            segment_ids=np.array(segment_ids))
+        _input_ids, _input_masks, _segment_ids = [], [], []
 
-        bert_outputs = self.bert_module(bert_inputs, signature="tokens", as_dict=True)
-        sequence_output = bert_outputs["sequence_output"]
-
-        token_embeddings = self.sess.run(sequence_output)
+        for text in tqdm(texts, desc="Converting texts to features"):
+            _input_id, _input_mask, _segment_id = self._model_single_input(text, is_tokenized)
+            _input_ids.append(_input_id)
+            _input_masks.append(_input_mask)
+            _segment_ids.append(_segment_id)
+
+        bert_inputs = {
+            self.input_ids: np.array(_input_ids),
+            self.input_masks: np.array(_input_masks),
+            self.segment_ids: np.array(_segment_ids)
+        }
+
+        token_embeddings = self.sess.run(self.bert_outputs, feed_dict=bert_inputs)["sequence_output"]
 
         if not pooling:
             return token_embeddings

diff --git a/embedding_as_service/text/elmo/__init__.py b/embedding_as_service/text/elmo/__init__.py
@@ -24,10 +24,15 @@ class Embeddings(object):
     EMBEDDING_MODELS: Dict[str, Embedding] = {embedding.name: embedding for embedding in EMBEDDING_MODELS}
 
     def __init__(self):
-        self.elmo_module = None
+        self.elmo_outputs = None
         self.model_name = None
+        self.max_seq_length = None
         self.sess = tf.Session()
 
+        # placeholder
+        self.tokens = None
+        self.sequence_len = None
+
     @classmethod
     def tokenize(cls, text: str):
         return [word.strip() for word in text.lower().strip().split()]
@@ -42,31 +47,44 @@ def padded_tokens(cls, tokens: List[str], max_seq_length: int):
             padded_len = max_seq_length - len_tokens
             return tokens + [padded_token] * padded_len
 
-    def load_model(self, model: str, model_path: str):
-        self.elmo_module = hub.Module(model_path)
-        self.sess.run(tf.initializers.global_variables())
+    def load_model(self, model: str, model_path: str, max_seq_length: int):
+        g = tf.Graph()
+        with g.as_default():
+            hub_module = hub.Module(model_path)
+            self.tokens = tf.placeholder(dtype=tf.string, shape=[None, max_seq_length])
+            self.sequence_len = tf.placeholder(dtype=tf.int32, shape=[None])
+
+            elmo_inputs = dict(
+                tokens=self.tokens,
+                sequence_len=self.sequence_len
+            )
+            self.elmo_outputs = hub_module(elmo_inputs, signature="tokens", as_dict=True)
+            init_op = tf.group([tf.global_variables_initializer()])
+        g.finalize()
+        self.sess = tf.Session(graph=g)
+        self.sess.run(init_op)
+
         self.model_name = model
+        self.max_seq_length = max_seq_length
 
     def encode(self, texts: Union[List[str], List[List[str]]],
                pooling: str,
-               max_seq_length: int,
                is_tokenized: bool = False,
                **kwargs
                ) -> Optional[np.array]:
 
         text_tokens = texts
         if not is_tokenized:
             text_tokens = [Embeddings.tokenize(text) for text in texts]
-        if max_seq_length:
-            text_tokens = [Embeddings.padded_tokens(tokens, max_seq_length) for tokens in text_tokens]
-            seq_length = [max_seq_length] * len(texts)
-        else:
-            seq_length = [len(tokens) for tokens in text_tokens]
+        text_tokens = [Embeddings.padded_tokens(tokens, self.max_seq_length) for tokens in text_tokens]
+        seq_length = [self.max_seq_length] * len(texts)
 
-        sequence_output = self.elmo_module(inputs={"tokens": text_tokens, "sequence_len": seq_length},
-                                           signature="tokens", as_dict=True)["elmo"]
+        elmo_inputs = {
+            self.tokens: np.array(text_tokens),
+            self.sequence_len: np.array(seq_length)
+        }
 
-        token_embeddings = self.sess.run(sequence_output)
+        token_embeddings = self.sess.run(self.elmo_outputs, feed_dict=elmo_inputs)["elmo"]
 
         if not pooling:
             return token_embeddings