diff --git a/README.md b/README.md
index bd5bdb5..a5b87cf 100644
--- a/README.md
+++ b/README.md
@@ -62,7 +62,7 @@ $ pip install embedding-as-service
#### 1. **Initialise encoder using supported embedding** and models from here
```python
>>> from embedding_as_service.text.encode import Encoder
->>> en = Encoder(embedding='bert', model='bert_base_cased', download=True)
+>>> en = Encoder(embedding='bert', model='bert_base_cased')
```
#### 2. Get sentences **tokens embedding**
```python
@@ -99,8 +99,9 @@ array([[-0.33547154, 0.34566957, 1.1954105 , ..., 0.33702594,
```
#### 4. Use custom `max_seq_length`, default is 128
-```python
->>> vecs = en.encode(texts=['hello aman', 'how are you?'], max_seq_length=256)
+```python
+>>> en = Encoder(embedding='bert', model='bert_base_cased', max_seq_length=256)
+>>> vecs = en.encode(texts=['hello aman', 'how are you?'])
>>> vecs
array([[ 0.48388457, -0.01327741, -0.76577514, ..., -0.54265064,
-0.5564591 , 0.6454179 ], [ 0.53209245, 0.00526248, -0.71091074, ..., -0.5171917 , -0.40458363, 0.6779779 ]], dtype=float32)
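With `max_seq_length` now fixed on the `Encoder` itself, every call pads or truncates to that length, so the token-level output shape follows directly from the constructor argument. A quick sanity check (a sketch; 768 is the assumed hidden size of `bert_base_cased`):

```python
>>> vecs.shape   # (num_texts, max_seq_length, hidden_size)
(2, 256, 768)
```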
@@ -131,7 +132,7 @@ array([[ 0.48388457, -0.01327741, -0.76577514, ..., -0.54265064,
|--------------------|------|-------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `embedding` | str | *Required* | embedding method to be used, check `Embedding` column here|
| `model`| str |*Required*| Model to be used for mentioned embedding, check `Model` column here|
-| `download`| bool |`False`| Download model if model does not exists|
+| `max_seq_length`| int |`128`| Maximum sequence length; longer inputs are truncated and shorter ones padded|
2. **def** `embedding_as_service.text.encoder.Encoder.encode`
@@ -139,7 +140,6 @@ array([[ 0.48388457, -0.01327741, -0.76577514, ..., -0.54265064,
|--------------------|------|-------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `Texts` | List[str] or List[List[str]] | *Required* | List of sentences or list of list of sentence tokens in case of `is_tokenized=True`
| `pooling`| str |(Optional)| Pooling methods to apply, here is available methods|
-| `max_seq_length`| int | `128` | Maximum Sequence Length, default is 128|
| `is_tokenized` | bool | `False` | set as True in case of tokens are passed for encoding |
| `batch_size` | int | `128` | maximum number of sequences handled by encoder, larger batch will be partitioned into small batches. |
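Taken together with the constructor change, a typical call now looks like the sketch below: `pooling` collapses the token axis, and `is_tokenized=True` lets callers pass their own token lists. This assumes `'reduce_mean'` is among the supported pooling methods and 768 is the hidden size of `bert_base_cased`.

```python
>>> from embedding_as_service.text.encode import Encoder
>>> en = Encoder(embedding='bert', model='bert_base_cased', max_seq_length=128)
>>> # sentence-level vectors via pooling
>>> en.encode(texts=['hello aman', 'how are you?'], pooling='reduce_mean').shape
(2, 768)
>>> # pre-tokenized input: pass token lists and set is_tokenized=True
>>> en.encode(texts=[['hello', 'aman'], ['how', 'are', 'you?']], is_tokenized=True).shape
(2, 128, 768)
```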
@@ -211,9 +211,9 @@ Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/d
Aman Srivastava 💻 📖 🚇 |
Ashutosh Singh 💻 📖 🚇 |
+ Chirag Jain 💻 📖 🚇 |
MrPranav101 💻 📖 🚇 |
Dhaval Taunk 💻 📖 🚇 |
- Chirag Jain 💻 📖 🚇 |
diff --git a/embedding_as_service/text/albert/__init__.py b/embedding_as_service/text/albert/__init__.py
index 512ee81..5383fb4 100644
--- a/embedding_as_service/text/albert/__init__.py
+++ b/embedding_as_service/text/albert/__init__.py
@@ -60,12 +60,18 @@ class Embeddings(object):
def __init__(self):
self.sess = tf.Session()
- self.albert_module = None
+ self.albert_outputs = None
self.model_name = None
+ self.max_seq_length = None
+
+ # placeholder definition
+ self.input_ids = None
+ self.input_masks = None
+ self.segment_ids = None
def create_tokenizer_from_hub_module(self):
"""Get the vocab file and casing info from the Hub module."""
- tokenization_info = self.albert_module(signature="tokenization_info", as_dict=True)
+ tokenization_info = self.albert_outputs(signature="tokenization_info", as_dict=True)
sentence_piece_file, do_lower_case = self.sess.run([tokenization_info["vocab_file"],
tokenization_info["do_lower_case"]])
@@ -78,9 +84,9 @@ def create_tokenizer_from_hub_module(self):
def tokenize(cls, text):
return cls.tokenizer.tokenize(text)
- @staticmethod
- def _model_single_input(text: Union[str, List[str]], max_seq_length: int, is_tokenized: bool = False
+ def _model_single_input(self, text: Union[str, List[str]], is_tokenized: bool = False
) -> Tuple[List[int], List[int], List[int]]:
+ max_seq_length = self.max_seq_length
tokens_a = text
if not is_tokenized:
tokens_a = Embeddings.tokenize(text)
@@ -115,35 +121,54 @@ def _model_single_input(text: Union[str, List[str]], max_seq_length: int, is_tok
return input_ids, input_mask, segment_ids
- def load_model(self, model: str, model_path: str):
- self.albert_module = hub.Module(model_path)
- self.sess.run(tf.initializers.global_variables())
- self.create_tokenizer_from_hub_module()
+ def load_model(self, model: str, model_path: str, max_seq_length: int):
+ g = tf.Graph()
+ with g.as_default():
+ self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None, max_seq_length])
+ self.input_masks = tf.placeholder(dtype=tf.int32, shape=[None, max_seq_length])
+ self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, max_seq_length])
+
+ hub_module = hub.Module(model_path)
+ albert_inputs = dict(
+ input_ids=self.input_ids,
+ input_mask=self.input_masks,
+ segment_ids=self.segment_ids
+ )
+ self.albert_outputs = hub_module(albert_inputs, signature="tokens", as_dict=True)
+ tokenization_info = hub_module(signature="tokenization_info", as_dict=True)
+ init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
+ g.finalize()
+ self.sess = tf.Session(graph=g)
+ self.sess.run(init_op)
+ sentence_piece_file, do_lower_case = self.sess.run(
+ [tokenization_info["vocab_file"], tokenization_info["do_lower_case"]]
+ )
+ Embeddings.tokenizer = FullTokenizer(vocab_file=None,
+ do_lower_case=do_lower_case,
+ spm_model_file=sentence_piece_file)
+ self.max_seq_length = max_seq_length
self.model_name = model
print("Model loaded Successfully !")
def encode(self, texts: Union[List[str], List[List[str]]],
pooling: str,
- max_seq_length: int,
is_tokenized: bool = False,
**kwargs
) -> Optional[np.array]:
input_ids, input_masks, segment_ids = [], [], []
for text in tqdm(texts, desc="Converting texts to features"):
- input_id, input_mask, segment_id = self._model_single_input(text, max_seq_length, is_tokenized)
+ input_id, input_mask, segment_id = self._model_single_input(text, is_tokenized)
input_ids.append(input_id)
input_masks.append(input_mask)
segment_ids.append(segment_id)
- albert_inputs = dict(
- input_ids=np.array(input_ids),
- input_mask=np.array(input_masks),
- segment_ids=np.array(segment_ids))
-
- bert_outputs = self.albert_module(albert_inputs, signature="tokens", as_dict=True)
- sequence_output = bert_outputs["sequence_output"]
+ albert_inputs = {
+ self.input_ids: np.array(input_ids),
+ self.input_masks: np.array(input_masks),
+ self.segment_ids: np.array(segment_ids)
+ }
- token_embeddings = self.sess.run(sequence_output)
+ token_embeddings = self.sess.run(self.albert_outputs, feed_dict=albert_inputs)["sequence_output"]
if not pooling:
return token_embeddings
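The pattern used here (and in the BERT, ELMo and USE backends below) is the standard TF1 recipe: declare fixed-shape placeholders, wire them into the hub module once, finalize the graph, and push every batch through the long-lived session via `feed_dict`. A stripped-down, self-contained sketch of that recipe, assuming a TF-Hub ALBERT handle (the library itself passes a local `model_path`):

```python
import numpy as np
import tensorflow as tf          # TF 1.15, as pinned in requirements.txt
import tensorflow_hub as hub

MAX_SEQ_LENGTH = 128
MODULE = "https://tfhub.dev/google/albert_base/3"   # assumed handle, for illustration only

g = tf.Graph()
with g.as_default():
    # fixed-shape placeholders are baked in at graph-construction time
    input_ids = tf.placeholder(tf.int32, [None, MAX_SEQ_LENGTH])
    input_masks = tf.placeholder(tf.int32, [None, MAX_SEQ_LENGTH])
    segment_ids = tf.placeholder(tf.int32, [None, MAX_SEQ_LENGTH])
    module = hub.Module(MODULE)
    outputs = module(dict(input_ids=input_ids, input_mask=input_masks, segment_ids=segment_ids),
                     signature="tokens", as_dict=True)
    init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
    g.finalize()                 # no new ops can be added by later encode() calls

sess = tf.Session(graph=g)
sess.run(init_op)

# every batch is a plain feed_dict run; the graph is never rebuilt
batch = np.zeros((2, MAX_SEQ_LENGTH), dtype=np.int32)
token_embeddings = sess.run(outputs, feed_dict={input_ids: batch,
                                                input_masks: batch,
                                                segment_ids: batch})["sequence_output"]
```

Finalizing the graph and reusing one session avoids the per-call graph growth that the old pattern of calling `self.albert_module(albert_inputs, ...)` inside `encode` caused.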
diff --git a/embedding_as_service/text/albert/tokenization.py b/embedding_as_service/text/albert/tokenization.py
index 88fd43a..e1759ad 100644
--- a/embedding_as_service/text/albert/tokenization.py
+++ b/embedding_as_service/text/albert/tokenization.py
@@ -262,7 +262,6 @@ def tokenize(self, text):
def convert_tokens_to_ids(self, tokens):
if self.sp_model:
- tf.logging.info("using sentence piece tokenzier.")
return [self.sp_model.PieceToId(
printable_text(token)) for token in tokens]
else:
@@ -270,7 +269,6 @@ def convert_tokens_to_ids(self, tokens):
def convert_ids_to_tokens(self, ids):
if self.sp_model:
- tf.logging.info("using sentence piece tokenzier.")
return [self.sp_model.IdToPiece(id_) for id_ in ids]
else:
return convert_by_vocab(self.inv_vocab, ids)
diff --git a/embedding_as_service/text/bert/__init__.py b/embedding_as_service/text/bert/__init__.py
index 99faeb4..879afbe 100644
--- a/embedding_as_service/text/bert/__init__.py
+++ b/embedding_as_service/text/bert/__init__.py
@@ -75,29 +75,24 @@ class Embeddings(object):
def __init__(self):
self.sess = tf.Session()
- self.bert_module = None
+ self.bert_outputs = None
self.model_name = None
+ self.max_seq_length = None
- def create_tokenizer_from_hub_module(self, model_path: str):
- """Get the vocab file and casing info from the Hub module."""
- tokenization_info = self.bert_module(signature="tokenization_info", as_dict=True)
- vocab_file, do_lower_case = self.sess.run(
- [
- tokenization_info["vocab_file"],
- tokenization_info["do_lower_case"],
- ]
- )
-
- Embeddings.tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
+ # placeholder definition
+ self.input_ids = None
+ self.input_masks = None
+ self.segment_ids = None
@classmethod
def tokenize(cls, text):
return cls.tokenizer.tokenize(text)
- @staticmethod
- def _model_single_input(text: Union[str, List[str]], max_seq_length: int, is_tokenized: bool = False
+ def _model_single_input(self, text: Union[str, List[str]], is_tokenized: bool = False
) -> Tuple[List[int], List[int], List[int]]:
+ max_seq_length = self.max_seq_length
tokens_a = text
+
if not is_tokenized:
tokens_a = Embeddings.tokenize(text)
if len(tokens_a) > max_seq_length - 2:
@@ -128,38 +123,60 @@ def _model_single_input(text: Union[str, List[str]], max_seq_length: int, is_tok
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
-
return input_ids, input_mask, segment_ids
- def load_model(self, model: str, model_path: str):
- self.bert_module = hub.Module(model_path)
- self.sess.run(tf.initializers.global_variables())
- self.create_tokenizer_from_hub_module(model_path)
+ def load_model(self, model: str, model_path: str, max_seq_length: int):
+ g = tf.Graph()
+ with g.as_default():
+ self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None, max_seq_length])
+ self.input_masks = tf.placeholder(dtype=tf.int32, shape=[None, max_seq_length])
+ self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, max_seq_length])
+
+ hub_module = hub.Module(model_path)
+ bert_inputs = dict(
+ input_ids=self.input_ids,
+ input_mask=self.input_masks,
+ segment_ids=self.segment_ids
+ )
+
+ self.bert_outputs = hub_module(bert_inputs, signature="tokens", as_dict=True)
+ tokenization_info = hub_module(signature="tokenization_info", as_dict=True)
+ init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
+ g.finalize()
+ self.sess = tf.Session(graph=g)
+ self.sess.run(init_op)
+ vocab_file, do_lower_case = self.sess.run(
+ [
+ tokenization_info["vocab_file"],
+ tokenization_info["do_lower_case"],
+ ]
+ )
+ Embeddings.tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
+ self.max_seq_length = max_seq_length
self.model_name = model
+
print("Model loaded Successfully !")
def encode(self, texts: Union[List[str], List[List[str]]],
pooling: str,
- max_seq_length: int,
is_tokenized: bool = False,
**kwargs
) -> Optional[np.array]:
- input_ids, input_masks, segment_ids = [], [], []
- for text in tqdm(texts, desc="Converting texts to features"):
- input_id, input_mask, segment_id = self._model_single_input(text, max_seq_length, is_tokenized)
- input_ids.append(input_id)
- input_masks.append(input_mask)
- segment_ids.append(segment_id)
-
- bert_inputs = dict(
- input_ids=np.array(input_ids),
- input_mask=np.array(input_masks),
- segment_ids=np.array(segment_ids))
+ _input_ids, _input_masks, _segment_ids = [], [], []
- bert_outputs = self.bert_module(bert_inputs, signature="tokens", as_dict=True)
- sequence_output = bert_outputs["sequence_output"]
-
- token_embeddings = self.sess.run(sequence_output)
+ for text in tqdm(texts, desc="Converting texts to features"):
+ _input_id, _input_mask, _segment_id = self._model_single_input(text, is_tokenized)
+ _input_ids.append(_input_id)
+ _input_masks.append(_input_mask)
+ _segment_ids.append(_segment_id)
+
+ bert_inputs = {
+ self.input_ids: np.array(_input_ids),
+ self.input_masks: np.array(_input_masks),
+ self.segment_ids: np.array(_segment_ids)
+ }
+
+ token_embeddings = self.sess.run(self.bert_outputs, feed_dict=bert_inputs)["sequence_output"]
if not pooling:
return token_embeddings
diff --git a/embedding_as_service/text/elmo/__init__.py b/embedding_as_service/text/elmo/__init__.py
index f36b51d..9ac4c00 100644
--- a/embedding_as_service/text/elmo/__init__.py
+++ b/embedding_as_service/text/elmo/__init__.py
@@ -24,10 +24,15 @@ class Embeddings(object):
EMBEDDING_MODELS: Dict[str, Embedding] = {embedding.name: embedding for embedding in EMBEDDING_MODELS}
def __init__(self):
- self.elmo_module = None
+ self.elmo_outputs = None
self.model_name = None
+ self.max_seq_length = None
self.sess = tf.Session()
+ # placeholder
+ self.tokens = None
+ self.sequence_len = None
+
@classmethod
def tokenize(cls, text: str):
return [word.strip() for word in text.lower().strip().split()]
@@ -42,14 +47,28 @@ def padded_tokens(cls, tokens: List[str], max_seq_length: int):
padded_len = max_seq_length - len_tokens
return tokens + [padded_token] * padded_len
- def load_model(self, model: str, model_path: str):
- self.elmo_module = hub.Module(model_path)
- self.sess.run(tf.initializers.global_variables())
+ def load_model(self, model: str, model_path: str, max_seq_length: int):
+ g = tf.Graph()
+ with g.as_default():
+ hub_module = hub.Module(model_path)
+ self.tokens = tf.placeholder(dtype=tf.string, shape=[None, max_seq_length])
+ self.sequence_len = tf.placeholder(dtype=tf.int32, shape=[None])
+
+ elmo_inputs = dict(
+ tokens=self.tokens,
+ sequence_len=self.sequence_len
+ )
+ self.elmo_outputs = hub_module(elmo_inputs, signature="tokens", as_dict=True)
+ init_op = tf.group([tf.global_variables_initializer()])
+ g.finalize()
+ self.sess = tf.Session(graph=g)
+ self.sess.run(init_op)
+
self.model_name = model
+ self.max_seq_length = max_seq_length
def encode(self, texts: Union[List[str], List[List[str]]],
pooling: str,
- max_seq_length: int,
is_tokenized: bool = False,
**kwargs
) -> Optional[np.array]:
@@ -57,16 +76,15 @@ def encode(self, texts: Union[List[str], List[List[str]]],
text_tokens = texts
if not is_tokenized:
text_tokens = [Embeddings.tokenize(text) for text in texts]
- if max_seq_length:
- text_tokens = [Embeddings.padded_tokens(tokens, max_seq_length) for tokens in text_tokens]
- seq_length = [max_seq_length] * len(texts)
- else:
- seq_length = [len(tokens) for tokens in text_tokens]
+ text_tokens = [Embeddings.padded_tokens(tokens, self.max_seq_length) for tokens in text_tokens]
+ seq_length = [self.max_seq_length] * len(texts)
- sequence_output = self.elmo_module(inputs={"tokens": text_tokens, "sequence_len": seq_length},
- signature="tokens", as_dict=True)["elmo"]
+ elmo_inputs = {
+ self.tokens: np.array(text_tokens),
+ self.sequence_len: np.array(seq_length)
+ }
- token_embeddings = self.sess.run(sequence_output)
+ token_embeddings = self.sess.run(self.elmo_outputs, feed_dict=elmo_inputs)["elmo"]
if not pooling:
return token_embeddings
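ELMo is the one backend that consumes the string tokens directly, so the feed is padded token strings plus the true sequence lengths rather than integer ids. A rough, self-contained illustration (the hub handle and the empty-string pad token are assumptions, not the library's constants):

```python
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

MAX_SEQ_LENGTH = 8
g = tf.Graph()
with g.as_default():
    tokens = tf.placeholder(tf.string, [None, MAX_SEQ_LENGTH])
    sequence_len = tf.placeholder(tf.int32, [None])
    module = hub.Module("https://tfhub.dev/google/elmo/2")   # assumed handle
    outputs = module(dict(tokens=tokens, sequence_len=sequence_len),
                     signature="tokens", as_dict=True)
    init_op = tf.group([tf.global_variables_initializer()])
    g.finalize()

sess = tf.Session(graph=g)
sess.run(init_op)

sents = [["hello", "aman"], ["how", "are", "you?"]]
padded = [s + [""] * (MAX_SEQ_LENGTH - len(s)) for s in sents]   # pad token assumed to be ""
vecs = sess.run(outputs, feed_dict={tokens: np.array(padded),
                                    sequence_len: np.array([len(s) for s in sents])})["elmo"]
# vecs has shape (2, MAX_SEQ_LENGTH, 1024) for the standard ELMo module
```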
diff --git a/embedding_as_service/text/encode.py b/embedding_as_service/text/encode.py
index f1ea789..620e4d7 100644
--- a/embedding_as_service/text/encode.py
+++ b/embedding_as_service/text/encode.py
@@ -8,11 +8,12 @@
class Encoder(object, metaclass=ArgSingleton):
- def __init__(self, embedding: str, model: str, download: bool = False):
+ def __init__(self, embedding: str, model: str, max_seq_length: int = 128):
self.embedding = embedding
self.model = model
self.embedding_model_dict = None
self.model_path = None
+ self.max_seq_length = max_seq_length
supported_embeddings = self.get_supported_embeddings()
@@ -30,7 +31,7 @@ def __init__(self, embedding: str, model: str, download: bool = False):
raise ValueError(f"Given embedding \"{embedding}\" does not have support for model \"{model}\", "
f"the supported models are: {model_names}")
- self.model_path = self._get_or_download_model(download)
+ self.model_path = self._get_or_download_model(download=True)
if not self.model_path:
-            print(f"Model does not exits, pass download param as True")
+            print("Model could not be found or downloaded")
return
@@ -89,7 +90,7 @@ def _get_or_download_model(self, download: bool) -> Optional[str]:
return model_path
def _load_model(self):
- self.embedding_cls.load_model(self.model, self.model_path)
+ self.embedding_cls.load_model(self.model, self.model_path, self.max_seq_length)
return
def tokenize(self, texts: Union[List[str], str]) -> np.array:
@@ -106,7 +107,6 @@ def tokenize(self, texts: Union[List[str], str]) -> np.array:
def encode(self,
texts: Union[List[str], List[List[str]]],
pooling: Optional[str] = None,
- max_seq_length: Optional[int] = 128,
is_tokenized: bool = False,
batch_size: int = 128,
** kwargs
@@ -115,12 +115,12 @@ def encode(self,
raise ValueError('Argument `texts` should be either List[str] or List[List[str]]')
if is_tokenized:
if not all(isinstance(text, list) for text in texts):
- raise ValueError('Argument `texts` should be List[List[str]] (list of tokens) when `is_tokenized` = True')
+ raise ValueError('Argument `texts` should be List[List[str]] (list of tokens) '
+ 'when `is_tokenized` = True')
embeddings = []
for i in range(0, len(texts), batch_size):
vectors = self.embedding_cls.encode(texts=texts[i: i + batch_size],
pooling=pooling,
- max_seq_length=max_seq_length,
is_tokenized=is_tokenized)
embeddings.append(vectors)
embeddings = np.vstack(embeddings)
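Note that batching is handled inside `encode`: the input is split by `batch_size`, each slice goes through the backend's `encode`, and the per-batch arrays are stacked with `np.vstack`, so callers always get one array back. Continuing the README example above (768 again assumed for `bert_base_cased`):

```python
>>> texts = ['sentence number %d' % i for i in range(1000)]
>>> vecs = en.encode(texts=texts, pooling='reduce_mean', batch_size=64)   # 16 internal batches
>>> vecs.shape
(1000, 768)
```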
diff --git a/embedding_as_service/text/fasttext/__init__.py b/embedding_as_service/text/fasttext/__init__.py
index ed289ef..1f7c4bc 100644
--- a/embedding_as_service/text/fasttext/__init__.py
+++ b/embedding_as_service/text/fasttext/__init__.py
@@ -61,12 +61,13 @@ class Embeddings(object):
def __init__(self):
self.word_vectors: Dict[Any, Any] = {}
self.model_name = None
+ self.max_seq_length = None
@classmethod
def tokenize(cls, text):
return [x.lower().strip() for x in text.split()]
- def load_model(self, model: str, model_path: str):
+ def load_model(self, model: str, model_path: str, max_seq_length: int):
try:
model_file = [f for f in os.listdir(model_path) if os.path.isfile(os.path.join(model_path, f))]
f = open(os.path.join(model_path, model_file[0]), 'r')
@@ -77,14 +78,15 @@ def load_model(self, model: str, model_path: str):
self.word_vectors[word] = np.array([float(val) for val in split_line[1:]])
print("Model loaded Successfully !")
self.model_name = model
+ self.max_seq_length = max_seq_length
return self
except Exception as e:
print('Error loading Model, ', str(e))
return self
- def _single_encode_text(self, text: Union[str, List[str]], oov_vector: np.array, max_seq_length: int,
+ def _single_encode_text(self, text: Union[str, List[str]], oov_vector: np.array,
is_tokenized: bool):
-
+ max_seq_length = self.max_seq_length
tokens = text
if not is_tokenized:
tokens = Embeddings.tokenize(text)
@@ -96,12 +98,11 @@ def _single_encode_text(self, text: Union[str, List[str]], oov_vector: np.array,
def encode(self, texts: Union[List[str], List[List[str]]],
pooling: str,
- max_seq_length: int,
is_tokenized: bool = False,
**kwargs
) -> Optional[np.array]:
oov_vector = np.zeros(Embeddings.EMBEDDING_MODELS[self.model_name].dimensions, dtype="float32")
- token_embeddings = np.array([self._single_encode_text(text, oov_vector, max_seq_length, is_tokenized)
+ token_embeddings = np.array([self._single_encode_text(text, oov_vector, is_tokenized)
for text in texts])
if not pooling:
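For the static word-vector backends (fastText here, and GloVe/word2vec below) the only behavioural change is that the padding length now comes from `self.max_seq_length` set at load time instead of a per-call argument. Roughly, fixed-length encoding of one text amounts to the following sketch; it is illustrative only, not the package's exact `_single_encode_text`:

```python
from typing import Dict, List
import numpy as np

def encode_fixed_length(tokens: List[str],
                        word_vectors: Dict[str, np.ndarray],
                        oov_vector: np.ndarray,
                        max_seq_length: int) -> np.ndarray:
    # look up each token, truncate to max_seq_length, pad the remainder with the OOV/zero vector
    vectors = [word_vectors.get(tok, oov_vector) for tok in tokens[:max_seq_length]]
    vectors += [oov_vector] * (max_seq_length - len(vectors))
    return np.array(vectors)            # shape: (max_seq_length, dimensions)

# toy usage with 3-dimensional vectors
vocab = {"hello": np.ones(3, dtype="float32")}
print(encode_fixed_length(["hello", "aman"], vocab, np.zeros(3, dtype="float32"), 4).shape)  # (4, 3)
```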
diff --git a/embedding_as_service/text/glove/__init__.py b/embedding_as_service/text/glove/__init__.py
index 77c94a2..f60c06d 100644
--- a/embedding_as_service/text/glove/__init__.py
+++ b/embedding_as_service/text/glove/__init__.py
@@ -114,12 +114,13 @@ class Embeddings(object):
def __init__(self):
self.word_vectors: Dict[Any, Any] = {}
self.model_name = None
+ self.max_seq_length = None
@classmethod
def tokenize(cls, text: str) -> List[str]:
return [x.lower().strip() for x in text.split()]
- def load_model(self, model: str, model_path: str):
+ def load_model(self, model: str, model_path: str, max_seq_length: int):
try:
model_file = [f for f in os.listdir(model_path) if os.path.isfile(os.path.join(model_path, f))]
f = open(os.path.join(model_path, model_file[0]), 'r')
@@ -129,14 +130,15 @@ def load_model(self, model: str, model_path: str):
self.word_vectors[word] = np.array([float(val) for val in split_line[1:]])
print("Model loaded Successfully !")
self.model_name = model
+ self.max_seq_length = max_seq_length
return self
except Exception as e:
print('Error loading Model, ', str(e))
return self
- def _single_encode_text(self, text: Union[str, List[str]], oov_vector: np.array, max_seq_length: int,
+ def _single_encode_text(self, text: Union[str, List[str]], oov_vector: np.array,
is_tokenized: bool):
-
+ max_seq_length = self.max_seq_length
tokens = text
if not is_tokenized:
tokens = Embeddings.tokenize(text)
@@ -148,12 +150,11 @@ def _single_encode_text(self, text: Union[str, List[str]], oov_vector: np.array,
def encode(self, texts: Union[List[str], List[List[str]]],
pooling: str,
- max_seq_length: int,
is_tokenized: bool = False,
**kwargs
) -> Optional[np.array]:
oov_vector = np.zeros(Embeddings.EMBEDDING_MODELS[self.model_name].dimensions, dtype="float32")
- token_embeddings = np.array([self._single_encode_text(text, oov_vector, max_seq_length, is_tokenized)
+ token_embeddings = np.array([self._single_encode_text(text, oov_vector, is_tokenized)
for text in texts])
if not pooling:
diff --git a/embedding_as_service/text/ulmfit/__init__.py b/embedding_as_service/text/ulmfit/__init__.py
index db08c8a..c09b2aa 100644
--- a/embedding_as_service/text/ulmfit/__init__.py
+++ b/embedding_as_service/text/ulmfit/__init__.py
@@ -67,7 +67,6 @@ def load_model(self, model: str, model_path: str):
def encode(self, texts: Union[List[str], List[List[str]]],
pooling: str,
- max_seq_length: int,
is_tokenized: bool = False,
**kwargs
) -> Optional[np.array]:
diff --git a/embedding_as_service/text/use/__init__.py b/embedding_as_service/text/use/__init__.py
index 490a8e6..f62fccc 100644
--- a/embedding_as_service/text/use/__init__.py
+++ b/embedding_as_service/text/use/__init__.py
@@ -4,6 +4,7 @@
from embedding_as_service.text import Embedding
import tensorflow as tf
import tensorflow_hub as hub
+import sentencepiece as spm
class Embeddings(object):
@@ -44,18 +45,70 @@ class Embeddings(object):
def __init__(self):
self.sess = tf.Session()
self.sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
- self.use_module = None
+ self.use_outputs = None
self.model_name = None
+ self.max_seq_length = None
+
+        # placeholder for the DAN and Transformer (large) variants
+ self.sentences = None
+
+        # SentencePiece processor and input placeholder for the lite variant
+ self.sp_model = spm.SentencePieceProcessor()
+ self.input_placeholder = None
+
+ def process_to_ids_in_sparse_format(self, sentences):
+        # A utility method that runs sentences through the SentencePiece processor
+        # and returns the results in the (values, indices, dense_shape) format
+        # expected by tf.sparse_placeholder.
+ ids = [self.sp_model.EncodeAsIds(x) for x in sentences]
+ max_len = max(len(x) for x in ids)
+ dense_shape = (len(ids), max_len)
+ values = [item for sublist in ids for item in sublist]
+ indices = [[row, col] for row in range(len(ids)) for col in range(len(ids[row]))]
+ return values, indices, dense_shape
+
+ def load_model(self, model: str, model_path: str, max_seq_length: int):
+ spm_path_info = None
+ g = tf.Graph()
+ with g.as_default():
+ hub_module = hub.Module(model_path)
+ if model == 'use_transformer_lite':
+ self.input_placeholder = tf.sparse_placeholder(tf.int64, shape=[None, None])
+ self.use_outputs = hub_module(
+ inputs=dict(
+ values=self.input_placeholder.values,
+ indices=self.input_placeholder.indices,
+ dense_shape=self.input_placeholder.dense_shape)
+ )
+ spm_path_info = hub_module(signature="spm_path")
+ else:
+ self.sentences = tf.placeholder(tf.string, shape=[None])
+ self.use_outputs = hub_module(self.sentences, as_dict=True)
+ init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
+
+ g.finalize()
+ self.sess = tf.Session(graph=g)
+ self.sess.run(init_op)
+
+ if model == 'use_transformer_lite':
+ spm_path = self.sess.run(spm_path_info)
+ self.sp_model.Load(spm_path)
- def load_model(self, model: str, model_path: str):
- self.use_module = hub.Module(model_path)
- self.sess.run(tf.initializers.global_variables())
self.model_name = model
+ self.max_seq_length = max_seq_length
def encode(self, texts: Union[List[str], List[List[str]]],
pooling: str,
- max_seq_length: int,
is_tokenized: bool = False,
**kwargs
) -> Optional[np.array]:
- return self.sess.run(self.use_module(texts))
+ if self.model_name == 'use_transformer_lite':
+ values, indices, dense_shape = self.process_to_ids_in_sparse_format(texts)
+ embeddings = self.sess.run(self.use_outputs, feed_dict={
+ self.input_placeholder.values: values,
+ self.input_placeholder.indices: indices,
+ self.input_placeholder.dense_shape: dense_shape
+ })
+ else:
+ embeddings = self.sess.run(self.use_outputs, feed_dict={self.sentences: texts})["default"]
+ return embeddings
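`process_to_ids_in_sparse_format` converts ragged SentencePiece id lists into the `(values, indices, dense_shape)` triple that `tf.sparse_placeholder` expects, which is why the lite branch feeds three tensors instead of a batch of strings. The mapping is easy to verify by hand with made-up ids (standalone sketch, no SentencePiece model needed):

```python
ids = [[1, 2, 3], [4, 5]]                               # ragged per-sentence ids

max_len = max(len(x) for x in ids)
dense_shape = (len(ids), max_len)                       # (2, 3)
values = [item for sublist in ids for item in sublist]  # [1, 2, 3, 4, 5]
indices = [[row, col] for row in range(len(ids)) for col in range(len(ids[row]))]
# indices -> [[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]]

print(values, indices, dense_shape)
```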
diff --git a/embedding_as_service/text/word2vec/__init__.py b/embedding_as_service/text/word2vec/__init__.py
index 45f2306..55533e2 100644
--- a/embedding_as_service/text/word2vec/__init__.py
+++ b/embedding_as_service/text/word2vec/__init__.py
@@ -28,12 +28,13 @@ class Embeddings(object):
def __init__(self):
self.word_vectors: Dict[Any, Any] = {}
self.model_name = None
+ self.max_seq_length = None
@classmethod
def tokenize(cls, text: str) -> List[str]:
return [x.lower().strip() for x in text.split()]
- def load_model(self, model: str, model_path: str):
+ def load_model(self, model: str, model_path: str, max_seq_length: int):
try:
encoding = 'utf-8'
unicode_errors = 'strict'
@@ -61,14 +62,15 @@ def load_model(self, model: str, model_path: str):
self.word_vectors[word] = weights
self.model_name = model
+ self.max_seq_length = max_seq_length
print("Model loaded Successfully !")
return self
except Exception as e:
print('Error loading Model, ', str(e))
- def _single_encode_text(self, text: Union[str, List[str]], oov_vector: np.array, max_seq_length: int,
+ def _single_encode_text(self, text: Union[str, List[str]], oov_vector: np.array,
is_tokenized: bool):
-
+ max_seq_length = self.max_seq_length
tokens = text
if not is_tokenized:
tokens = Embeddings.tokenize(text)
@@ -80,12 +82,11 @@ def _single_encode_text(self, text: Union[str, List[str]], oov_vector: np.array,
def encode(self, texts: Union[List[str], List[List[str]]],
pooling: str,
- max_seq_length: int,
is_tokenized: bool = False,
**kwargs
) -> Optional[np.array]:
oov_vector = np.zeros(Embeddings.EMBEDDING_MODELS[self.model_name].dimensions, dtype="float32")
- token_embeddings = np.array([self._single_encode_text(text, oov_vector, max_seq_length, is_tokenized)
+ token_embeddings = np.array([self._single_encode_text(text, oov_vector, is_tokenized)
for text in texts])
if not pooling:
diff --git a/embedding_as_service/text/xlnet/__init__.py b/embedding_as_service/text/xlnet/__init__.py
index ac93822..163c7ab 100644
--- a/embedding_as_service/text/xlnet/__init__.py
+++ b/embedding_as_service/text/xlnet/__init__.py
@@ -58,6 +58,7 @@ def __init__(self):
self.xlnet_config = None
self.run_config = None
self.model_name = None
+ self.max_seq_length = None
self.sess = tf.Session()
@staticmethod
@@ -72,10 +73,9 @@ def tokenize(cls, text):
text = preprocess_text(text, lower=False)
return encode_pieces(cls.tokenizer, text)
- @staticmethod
- def _model_single_input(text: Union[str, List[str]], max_seq_length: int, is_tokenized: bool
+ def _model_single_input(self, text: Union[str, List[str]], is_tokenized: bool
) -> Tuple[List[int], List[int], List[int]]:
-
+ max_seq_length = self.max_seq_length
tokens_a = text
if not is_tokenized:
tokens_a = Embeddings.tokenize(text)
@@ -115,24 +115,24 @@ def _model_single_input(text: Union[str, List[str]], max_seq_length: int, is_tok
return input_ids, input_mask, segment_ids
- def load_model(self, model: str, model_path: str):
+ def load_model(self, model: str, model_path: str, max_seq_length: int):
model_path = os.path.join(model_path, next(os.walk(model_path))[1][0])
self.xlnet_config = xlnet.XLNetConfig(json_path=os.path.join(model_path, Embeddings.mode_config_path))
self.run_config = xlnet.create_run_config(is_training=True, is_finetune=True, FLAGS=Flags)
self.load_tokenizer(model_path)
+ self.max_seq_length = max_seq_length
self.model_name = model
print("Model loaded Successfully !")
def encode(self,
texts: Union[List[str], List[List[str]]],
pooling: str,
- max_seq_length: int,
is_tokenized: bool = False,
**kwargs
) -> Optional[np.array]:
input_ids, input_masks, segment_ids = [], [], []
for text in tqdm(texts, desc="Converting texts to features"):
- input_id, input_mask, segment_id = self._model_single_input(text, max_seq_length, is_tokenized)
+ input_id, input_mask, segment_id = self._model_single_input(text, is_tokenized)
input_ids.append(input_id)
input_masks.append(input_mask)
segment_ids.append(segment_id)
diff --git a/requirements.txt b/requirements.txt
index b1730ec..6bfd5b7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,3 +8,3 @@ smart_open==1.8.4
-sentencepiece==0.1.82
+sentencepiece==0.1.85
tensorflow==1.15.0
setuptools>=41.0.0