diff --git a/README.md b/README.md index bd5bdb5..a5b87cf 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ $ pip install embedding-as-service #### 1. **Intialise encoder using supported embedding** and models from here ```python >>> from embedding_as_service.text.encode import Encoder ->>> en = Encoder(embedding='bert', model='bert_base_cased', download=True) +>>> en = Encoder(embedding='bert', model='bert_base_cased') ``` #### 2. Get sentences **tokens embedding** ```python @@ -99,8 +99,9 @@ array([[-0.33547154, 0.34566957, 1.1954105 , ..., 0.33702594, ``` #### 4. Use custom `max_seq_length`, default is 128 -```python ->>> vecs = en.encode(texts=['hello aman', 'how are you?'], max_seq_length=256) +```python +>>> en = Encoder(embedding='bert', model='bert_base_cased', max_seq_length=256) +>>> vecs = en.encode(texts=['hello aman', 'how are you?']) >>> vecs array([[ 0.48388457, -0.01327741, -0.76577514, ..., -0.54265064, -0.5564591 , 0.6454179 ], [ 0.53209245, 0.00526248, -0.71091074, ..., -0.5171917 , -0.40458363, 0.6779779 ]], dtype=float32) @@ -131,7 +132,7 @@ array([[ 0.48388457, -0.01327741, -0.76577514, ..., -0.54265064, |--------------------|------|-------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------| | `embedding` | str | *Required* | embedding method to be used, check `Embedding` column here| | `model`| str |*Required*| Model to be used for mentioned embedding, check `Model` column here| -| `download`| bool |`False`| Download model if model does not exists| +| `max_seq_length`| int |128| Maximum Sequence Length, default is 128| 2. **def** `embedding_as_service.text.encoder.Encoder.encode` @@ -139,7 +140,6 @@ array([[ 0.48388457, -0.01327741, -0.76577514, ..., -0.54265064, |--------------------|------|-------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------| | `Texts` | List[str] or List[List[str]] | *Required* | List of sentences or list of list of sentence tokens in case of `is_tokenized=True` | `pooling`| str |(Optional)| Pooling methods to apply, here is available methods| -| `max_seq_length`| int | `128` | Maximum Sequence Length, default is 128| | `is_tokenized` | bool | `False` | set as True in case of tokens are passed for encoding | | `batch_size` | int | `128` | maximum number of sequences handled by encoder, larger batch will be partitioned into small batches. | @@ -211,9 +211,9 @@ Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/d Aman Srivastava
 💻 📖 🚇
 Ashutosh Singh 💻 📖 🚇
+Chirag Jain 💻 📖 🚇
 MrPranav101 💻 📖 🚇
 Dhaval Taunk 💻 📖 🚇
-Chirag Jain
💻 📖 🚇 diff --git a/embedding_as_service/text/albert/__init__.py b/embedding_as_service/text/albert/__init__.py index 512ee81..5383fb4 100644 --- a/embedding_as_service/text/albert/__init__.py +++ b/embedding_as_service/text/albert/__init__.py @@ -60,12 +60,18 @@ class Embeddings(object): def __init__(self): self.sess = tf.Session() - self.albert_module = None + self.albert_outputs = None self.model_name = None + self.max_seq_length = None + + # placeholder definition + self.input_ids = None + self.input_masks = None + self.segment_ids = None def create_tokenizer_from_hub_module(self): """Get the vocab file and casing info from the Hub module.""" - tokenization_info = self.albert_module(signature="tokenization_info", as_dict=True) + tokenization_info = self.albert_outputs(signature="tokenization_info", as_dict=True) sentence_piece_file, do_lower_case = self.sess.run([tokenization_info["vocab_file"], tokenization_info["do_lower_case"]]) @@ -78,9 +84,9 @@ def create_tokenizer_from_hub_module(self): def tokenize(cls, text): return cls.tokenizer.tokenize(text) - @staticmethod - def _model_single_input(text: Union[str, List[str]], max_seq_length: int, is_tokenized: bool = False + def _model_single_input(self, text: Union[str, List[str]], is_tokenized: bool = False ) -> Tuple[List[int], List[int], List[int]]: + max_seq_length = self.max_seq_length tokens_a = text if not is_tokenized: tokens_a = Embeddings.tokenize(text) @@ -115,35 +121,54 @@ def _model_single_input(text: Union[str, List[str]], max_seq_length: int, is_tok return input_ids, input_mask, segment_ids - def load_model(self, model: str, model_path: str): - self.albert_module = hub.Module(model_path) - self.sess.run(tf.initializers.global_variables()) - self.create_tokenizer_from_hub_module() + def load_model(self, model: str, model_path: str, max_seq_length: int): + g = tf.Graph() + with g.as_default(): + self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None, max_seq_length]) + self.input_masks = tf.placeholder(dtype=tf.int32, shape=[None, max_seq_length]) + self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, max_seq_length]) + + hub_module = hub.Module(model_path) + albert_inputs = dict( + input_ids=self.input_ids, + input_mask=self.input_masks, + segment_ids=self.segment_ids + ) + self.albert_outputs = hub_module(albert_inputs, signature="tokens", as_dict=True) + tokenization_info = hub_module(signature="tokenization_info", as_dict=True) + init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()]) + g.finalize() + self.sess = tf.Session(graph=g) + self.sess.run(init_op) + sentence_piece_file, do_lower_case = self.sess.run( + [tokenization_info["vocab_file"], tokenization_info["do_lower_case"]] + ) + Embeddings.tokenizer = FullTokenizer(vocab_file=None, + do_lower_case=do_lower_case, + spm_model_file=sentence_piece_file) + self.max_seq_length = max_seq_length self.model_name = model print("Model loaded Successfully !") def encode(self, texts: Union[List[str], List[List[str]]], pooling: str, - max_seq_length: int, is_tokenized: bool = False, **kwargs ) -> Optional[np.array]: input_ids, input_masks, segment_ids = [], [], [] for text in tqdm(texts, desc="Converting texts to features"): - input_id, input_mask, segment_id = self._model_single_input(text, max_seq_length, is_tokenized) + input_id, input_mask, segment_id = self._model_single_input(text, is_tokenized) input_ids.append(input_id) input_masks.append(input_mask) segment_ids.append(segment_id) - albert_inputs = dict( - 
input_ids=np.array(input_ids), - input_mask=np.array(input_masks), - segment_ids=np.array(segment_ids)) - - bert_outputs = self.albert_module(albert_inputs, signature="tokens", as_dict=True) - sequence_output = bert_outputs["sequence_output"] + albert_inputs = { + self.input_ids: np.array(input_ids), + self.input_masks: np.array(input_masks), + self.segment_ids: np.array(segment_ids) + } - token_embeddings = self.sess.run(sequence_output) + token_embeddings = self.sess.run(self.albert_outputs, feed_dict=albert_inputs)["sequence_output"] if not pooling: return token_embeddings diff --git a/embedding_as_service/text/albert/tokenization.py b/embedding_as_service/text/albert/tokenization.py index 88fd43a..e1759ad 100644 --- a/embedding_as_service/text/albert/tokenization.py +++ b/embedding_as_service/text/albert/tokenization.py @@ -262,7 +262,6 @@ def tokenize(self, text): def convert_tokens_to_ids(self, tokens): if self.sp_model: - tf.logging.info("using sentence piece tokenzier.") return [self.sp_model.PieceToId( printable_text(token)) for token in tokens] else: @@ -270,7 +269,6 @@ def convert_tokens_to_ids(self, tokens): def convert_ids_to_tokens(self, ids): if self.sp_model: - tf.logging.info("using sentence piece tokenzier.") return [self.sp_model.IdToPiece(id_) for id_ in ids] else: return convert_by_vocab(self.inv_vocab, ids) diff --git a/embedding_as_service/text/bert/__init__.py b/embedding_as_service/text/bert/__init__.py index 99faeb4..879afbe 100644 --- a/embedding_as_service/text/bert/__init__.py +++ b/embedding_as_service/text/bert/__init__.py @@ -75,29 +75,24 @@ class Embeddings(object): def __init__(self): self.sess = tf.Session() - self.bert_module = None + self.bert_outputs = None self.model_name = None + self.max_seq_length = None - def create_tokenizer_from_hub_module(self, model_path: str): - """Get the vocab file and casing info from the Hub module.""" - tokenization_info = self.bert_module(signature="tokenization_info", as_dict=True) - vocab_file, do_lower_case = self.sess.run( - [ - tokenization_info["vocab_file"], - tokenization_info["do_lower_case"], - ] - ) - - Embeddings.tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case) + # placeholder definition + self.input_ids = None + self.input_masks = None + self.segment_ids = None @classmethod def tokenize(cls, text): return cls.tokenizer.tokenize(text) - @staticmethod - def _model_single_input(text: Union[str, List[str]], max_seq_length: int, is_tokenized: bool = False + def _model_single_input(self, text: Union[str, List[str]], is_tokenized: bool = False ) -> Tuple[List[int], List[int], List[int]]: + max_seq_length = self.max_seq_length tokens_a = text + if not is_tokenized: tokens_a = Embeddings.tokenize(text) if len(tokens_a) > max_seq_length - 2: @@ -128,38 +123,60 @@ def _model_single_input(text: Union[str, List[str]], max_seq_length: int, is_tok assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length - return input_ids, input_mask, segment_ids - def load_model(self, model: str, model_path: str): - self.bert_module = hub.Module(model_path) - self.sess.run(tf.initializers.global_variables()) - self.create_tokenizer_from_hub_module(model_path) + def load_model(self, model: str, model_path: str, max_seq_length: int): + g = tf.Graph() + with g.as_default(): + self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None, max_seq_length]) + self.input_masks = tf.placeholder(dtype=tf.int32, shape=[None, max_seq_length]) + 
self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, max_seq_length]) + + hub_module = hub.Module(model_path) + bert_inputs = dict( + input_ids=self.input_ids, + input_mask=self.input_masks, + segment_ids=self.segment_ids + ) + + self.bert_outputs = hub_module(bert_inputs, signature="tokens", as_dict=True) + tokenization_info = hub_module(signature="tokenization_info", as_dict=True) + init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()]) + g.finalize() + self.sess = tf.Session(graph=g) + self.sess.run(init_op) + vocab_file, do_lower_case = self.sess.run( + [ + tokenization_info["vocab_file"], + tokenization_info["do_lower_case"], + ] + ) + Embeddings.tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case) + self.max_seq_length = max_seq_length self.model_name = model + print("Model loaded Successfully !") def encode(self, texts: Union[List[str], List[List[str]]], pooling: str, - max_seq_length: int, is_tokenized: bool = False, **kwargs ) -> Optional[np.array]: - input_ids, input_masks, segment_ids = [], [], [] - for text in tqdm(texts, desc="Converting texts to features"): - input_id, input_mask, segment_id = self._model_single_input(text, max_seq_length, is_tokenized) - input_ids.append(input_id) - input_masks.append(input_mask) - segment_ids.append(segment_id) - - bert_inputs = dict( - input_ids=np.array(input_ids), - input_mask=np.array(input_masks), - segment_ids=np.array(segment_ids)) + _input_ids, _input_masks, _segment_ids = [], [], [] - bert_outputs = self.bert_module(bert_inputs, signature="tokens", as_dict=True) - sequence_output = bert_outputs["sequence_output"] - - token_embeddings = self.sess.run(sequence_output) + for text in tqdm(texts, desc="Converting texts to features"): + _input_id, _input_mask, _segment_id = self._model_single_input(text, is_tokenized) + _input_ids.append(_input_id) + _input_masks.append(_input_mask) + _segment_ids.append(_segment_id) + + bert_inputs = { + self.input_ids: np.array(_input_ids), + self.input_masks: np.array(_input_masks), + self.segment_ids: np.array(_segment_ids) + } + + token_embeddings = self.sess.run(self.bert_outputs, feed_dict=bert_inputs)["sequence_output"] if not pooling: return token_embeddings diff --git a/embedding_as_service/text/elmo/__init__.py b/embedding_as_service/text/elmo/__init__.py index f36b51d..9ac4c00 100644 --- a/embedding_as_service/text/elmo/__init__.py +++ b/embedding_as_service/text/elmo/__init__.py @@ -24,10 +24,15 @@ class Embeddings(object): EMBEDDING_MODELS: Dict[str, Embedding] = {embedding.name: embedding for embedding in EMBEDDING_MODELS} def __init__(self): - self.elmo_module = None + self.elmo_outputs = None self.model_name = None + self.max_seq_length = None self.sess = tf.Session() + # placeholder + self.tokens = None + self.sequence_len = None + @classmethod def tokenize(cls, text: str): return [word.strip() for word in text.lower().strip().split()] @@ -42,14 +47,28 @@ def padded_tokens(cls, tokens: List[str], max_seq_length: int): padded_len = max_seq_length - len_tokens return tokens + [padded_token] * padded_len - def load_model(self, model: str, model_path: str): - self.elmo_module = hub.Module(model_path) - self.sess.run(tf.initializers.global_variables()) + def load_model(self, model: str, model_path: str, max_seq_length: int): + g = tf.Graph() + with g.as_default(): + hub_module = hub.Module(model_path) + self.tokens = tf.placeholder(dtype=tf.string, shape=[None, max_seq_length]) + self.sequence_len = tf.placeholder(dtype=tf.int32, 
shape=[None]) + + elmo_inputs = dict( + tokens=self.tokens, + sequence_len=self.sequence_len + ) + self.elmo_outputs = hub_module(elmo_inputs, signature="tokens", as_dict=True) + init_op = tf.group([tf.global_variables_initializer()]) + g.finalize() + self.sess = tf.Session(graph=g) + self.sess.run(init_op) + self.model_name = model + self.max_seq_length = max_seq_length def encode(self, texts: Union[List[str], List[List[str]]], pooling: str, - max_seq_length: int, is_tokenized: bool = False, **kwargs ) -> Optional[np.array]: @@ -57,16 +76,15 @@ def encode(self, texts: Union[List[str], List[List[str]]], text_tokens = texts if not is_tokenized: text_tokens = [Embeddings.tokenize(text) for text in texts] - if max_seq_length: - text_tokens = [Embeddings.padded_tokens(tokens, max_seq_length) for tokens in text_tokens] - seq_length = [max_seq_length] * len(texts) - else: - seq_length = [len(tokens) for tokens in text_tokens] + text_tokens = [Embeddings.padded_tokens(tokens, self.max_seq_length) for tokens in text_tokens] + seq_length = [self.max_seq_length] * len(texts) - sequence_output = self.elmo_module(inputs={"tokens": text_tokens, "sequence_len": seq_length}, - signature="tokens", as_dict=True)["elmo"] + elmo_inputs = { + self.tokens: np.array(text_tokens), + self.sequence_len: np.array(seq_length) + } - token_embeddings = self.sess.run(sequence_output) + token_embeddings = self.sess.run(self.elmo_outputs, feed_dict=elmo_inputs)["elmo"] if not pooling: return token_embeddings diff --git a/embedding_as_service/text/encode.py b/embedding_as_service/text/encode.py index f1ea789..620e4d7 100644 --- a/embedding_as_service/text/encode.py +++ b/embedding_as_service/text/encode.py @@ -8,11 +8,12 @@ class Encoder(object, metaclass=ArgSingleton): - def __init__(self, embedding: str, model: str, download: bool = False): + def __init__(self, embedding: str, model: str, max_seq_length: int = 128): self.embedding = embedding self.model = model self.embedding_model_dict = None self.model_path = None + self.max_seq_length = max_seq_length supported_embeddings = self.get_supported_embeddings() @@ -30,7 +31,7 @@ def __init__(self, embedding: str, model: str, download: bool = False): raise ValueError(f"Given embedding \"{embedding}\" does not have support for model \"{model}\", " f"the supported models are: {model_names}") - self.model_path = self._get_or_download_model(download) + self.model_path = self._get_or_download_model(download=True) if not self.model_path: print(f"Model does not exits, pass download param as True") return @@ -89,7 +90,7 @@ def _get_or_download_model(self, download: bool) -> Optional[str]: return model_path def _load_model(self): - self.embedding_cls.load_model(self.model, self.model_path) + self.embedding_cls.load_model(self.model, self.model_path, self.max_seq_length) return def tokenize(self, texts: Union[List[str], str]) -> np.array: @@ -106,7 +107,6 @@ def tokenize(self, texts: Union[List[str], str]) -> np.array: def encode(self, texts: Union[List[str], List[List[str]]], pooling: Optional[str] = None, - max_seq_length: Optional[int] = 128, is_tokenized: bool = False, batch_size: int = 128, ** kwargs @@ -115,12 +115,12 @@ def encode(self, raise ValueError('Argument `texts` should be either List[str] or List[List[str]]') if is_tokenized: if not all(isinstance(text, list) for text in texts): - raise ValueError('Argument `texts` should be List[List[str]] (list of tokens) when `is_tokenized` = True') + raise ValueError('Argument `texts` should be List[List[str]] (list of tokens) ' + 
'when `is_tokenized` = True') embeddings = [] for i in range(0, len(texts), batch_size): vectors = self.embedding_cls.encode(texts=texts[i: i + batch_size], pooling=pooling, - max_seq_length=max_seq_length, is_tokenized=is_tokenized) embeddings.append(vectors) embeddings = np.vstack(embeddings) diff --git a/embedding_as_service/text/fasttext/__init__.py b/embedding_as_service/text/fasttext/__init__.py index ed289ef..1f7c4bc 100644 --- a/embedding_as_service/text/fasttext/__init__.py +++ b/embedding_as_service/text/fasttext/__init__.py @@ -61,12 +61,13 @@ class Embeddings(object): def __init__(self): self.word_vectors: Dict[Any, Any] = {} self.model_name = None + self.max_seq_length = None @classmethod def tokenize(cls, text): return [x.lower().strip() for x in text.split()] - def load_model(self, model: str, model_path: str): + def load_model(self, model: str, model_path: str, max_seq_length: int): try: model_file = [f for f in os.listdir(model_path) if os.path.isfile(os.path.join(model_path, f))] f = open(os.path.join(model_path, model_file[0]), 'r') @@ -77,14 +78,15 @@ def load_model(self, model: str, model_path: str): self.word_vectors[word] = np.array([float(val) for val in split_line[1:]]) print("Model loaded Successfully !") self.model_name = model + self.max_seq_length = max_seq_length return self except Exception as e: print('Error loading Model, ', str(e)) return self - def _single_encode_text(self, text: Union[str, List[str]], oov_vector: np.array, max_seq_length: int, + def _single_encode_text(self, text: Union[str, List[str]], oov_vector: np.array, is_tokenized: bool): - + max_seq_length = self.max_seq_length tokens = text if not is_tokenized: tokens = Embeddings.tokenize(text) @@ -96,12 +98,11 @@ def _single_encode_text(self, text: Union[str, List[str]], oov_vector: np.array, def encode(self, texts: Union[List[str], List[List[str]]], pooling: str, - max_seq_length: int, is_tokenized: bool = False, **kwargs ) -> Optional[np.array]: oov_vector = np.zeros(Embeddings.EMBEDDING_MODELS[self.model_name].dimensions, dtype="float32") - token_embeddings = np.array([self._single_encode_text(text, oov_vector, max_seq_length, is_tokenized) + token_embeddings = np.array([self._single_encode_text(text, oov_vector, is_tokenized) for text in texts]) if not pooling: diff --git a/embedding_as_service/text/glove/__init__.py b/embedding_as_service/text/glove/__init__.py index 77c94a2..f60c06d 100644 --- a/embedding_as_service/text/glove/__init__.py +++ b/embedding_as_service/text/glove/__init__.py @@ -114,12 +114,13 @@ class Embeddings(object): def __init__(self): self.word_vectors: Dict[Any, Any] = {} self.model_name = None + self.max_seq_length = None @classmethod def tokenize(cls, text: str) -> List[str]: return [x.lower().strip() for x in text.split()] - def load_model(self, model: str, model_path: str): + def load_model(self, model: str, model_path: str, max_seq_length: int): try: model_file = [f for f in os.listdir(model_path) if os.path.isfile(os.path.join(model_path, f))] f = open(os.path.join(model_path, model_file[0]), 'r') @@ -129,14 +130,15 @@ def load_model(self, model: str, model_path: str): self.word_vectors[word] = np.array([float(val) for val in split_line[1:]]) print("Model loaded Successfully !") self.model_name = model + self.max_seq_length = max_seq_length return self except Exception as e: print('Error loading Model, ', str(e)) return self - def _single_encode_text(self, text: Union[str, List[str]], oov_vector: np.array, max_seq_length: int, + def _single_encode_text(self, 
text: Union[str, List[str]], oov_vector: np.array, is_tokenized: bool): - + max_seq_length = self.max_seq_length tokens = text if not is_tokenized: tokens = Embeddings.tokenize(text) @@ -148,12 +150,11 @@ def _single_encode_text(self, text: Union[str, List[str]], oov_vector: np.array, def encode(self, texts: Union[List[str], List[List[str]]], pooling: str, - max_seq_length: int, is_tokenized: bool = False, **kwargs ) -> Optional[np.array]: oov_vector = np.zeros(Embeddings.EMBEDDING_MODELS[self.model_name].dimensions, dtype="float32") - token_embeddings = np.array([self._single_encode_text(text, oov_vector, max_seq_length, is_tokenized) + token_embeddings = np.array([self._single_encode_text(text, oov_vector, is_tokenized) for text in texts]) if not pooling: diff --git a/embedding_as_service/text/ulmfit/__init__.py b/embedding_as_service/text/ulmfit/__init__.py index db08c8a..c09b2aa 100644 --- a/embedding_as_service/text/ulmfit/__init__.py +++ b/embedding_as_service/text/ulmfit/__init__.py @@ -67,7 +67,6 @@ def load_model(self, model: str, model_path: str): def encode(self, texts: Union[List[str], List[List[str]]], pooling: str, - max_seq_length: int, is_tokenized: bool = False, **kwargs ) -> Optional[np.array]: diff --git a/embedding_as_service/text/use/__init__.py b/embedding_as_service/text/use/__init__.py index 490a8e6..f62fccc 100644 --- a/embedding_as_service/text/use/__init__.py +++ b/embedding_as_service/text/use/__init__.py @@ -4,6 +4,7 @@ from embedding_as_service.text import Embedding import tensorflow as tf import tensorflow_hub as hub +import sentencepiece as spm class Embeddings(object): @@ -44,18 +45,70 @@ class Embeddings(object): def __init__(self): self.sess = tf.Session() self.sess.run([tf.global_variables_initializer(), tf.tables_initializer()]) - self.use_module = None + self.use_outputs = None self.model_name = None + self.max_seq_length = None + + # placeholder for dan and large model + self.sentences = None + + # sentencepiece and place holder model for lite version + self.sp_model = spm.SentencePieceProcessor() + self.input_placeholder = None + + def process_to_ids_in_sparse_format(self, sentences): + # An utility method that processes sentences with the sentence piece processor + # 'sp' and returns the results in tf.SparseTensor-similar format: + # (values, indices, dense_shape) + ids = [self.sp_model.EncodeAsIds(x) for x in sentences] + max_len = max(len(x) for x in ids) + dense_shape = (len(ids), max_len) + values = [item for sublist in ids for item in sublist] + indices = [[row, col] for row in range(len(ids)) for col in range(len(ids[row]))] + return values, indices, dense_shape + + def load_model(self, model: str, model_path: str, max_seq_length: int): + spm_path_info = None + g = tf.Graph() + with g.as_default(): + hub_module = hub.Module(model_path) + if model == 'use_transformer_lite': + self.input_placeholder = tf.sparse_placeholder(tf.int64, shape=[None, None]) + self.use_outputs = hub_module( + inputs=dict( + values=self.input_placeholder.values, + indices=self.input_placeholder.indices, + dense_shape=self.input_placeholder.dense_shape) + ) + spm_path_info = hub_module(signature="spm_path") + else: + self.sentences = tf.placeholder(tf.string, shape=[None]) + self.use_outputs = hub_module(self.sentences, as_dict=True) + init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()]) + + g.finalize() + self.sess = tf.Session(graph=g) + self.sess.run(init_op) + + if model == 'use_transformer_lite': + spm_path = self.sess.run(spm_path_info) 
+ self.sp_model.Load(spm_path) - def load_model(self, model: str, model_path: str): - self.use_module = hub.Module(model_path) - self.sess.run(tf.initializers.global_variables()) self.model_name = model + self.max_seq_length = max_seq_length def encode(self, texts: Union[List[str], List[List[str]]], pooling: str, - max_seq_length: int, is_tokenized: bool = False, **kwargs ) -> Optional[np.array]: - return self.sess.run(self.use_module(texts)) + if self.model_name == 'use_transformer_lite': + values, indices, dense_shape = self.process_to_ids_in_sparse_format(texts) + embeddings = self.sess.run(self.use_outputs, feed_dict={ + self.input_placeholder.values: values, + self.input_placeholder.indices: indices, + self.input_placeholder.dense_shape: dense_shape + }) + else: + embeddings = self.sess.run(self.use_outputs, feed_dict={self.sentences: texts})["default"] + return embeddings diff --git a/embedding_as_service/text/word2vec/__init__.py b/embedding_as_service/text/word2vec/__init__.py index 45f2306..55533e2 100644 --- a/embedding_as_service/text/word2vec/__init__.py +++ b/embedding_as_service/text/word2vec/__init__.py @@ -28,12 +28,13 @@ class Embeddings(object): def __init__(self): self.word_vectors: Dict[Any, Any] = {} self.model_name = None + self.max_seq_length = None @classmethod def tokenize(cls, text: str) -> List[str]: return [x.lower().strip() for x in text.split()] - def load_model(self, model: str, model_path: str): + def load_model(self, model: str, model_path: str, max_seq_length: int): try: encoding = 'utf-8' unicode_errors = 'strict' @@ -61,14 +62,15 @@ def load_model(self, model: str, model_path: str): self.word_vectors[word] = weights self.model_name = model + self.max_seq_length = max_seq_length print("Model loaded Successfully !") return self except Exception as e: print('Error loading Model, ', str(e)) - def _single_encode_text(self, text: Union[str, List[str]], oov_vector: np.array, max_seq_length: int, + def _single_encode_text(self, text: Union[str, List[str]], oov_vector: np.array, is_tokenized: bool): - + max_seq_length = self.max_seq_length tokens = text if not is_tokenized: tokens = Embeddings.tokenize(text) @@ -80,12 +82,11 @@ def _single_encode_text(self, text: Union[str, List[str]], oov_vector: np.array, def encode(self, texts: Union[List[str], List[List[str]]], pooling: str, - max_seq_length: int, is_tokenized: bool = False, **kwargs ) -> Optional[np.array]: oov_vector = np.zeros(Embeddings.EMBEDDING_MODELS[self.model_name].dimensions, dtype="float32") - token_embeddings = np.array([self._single_encode_text(text, oov_vector, max_seq_length, is_tokenized) + token_embeddings = np.array([self._single_encode_text(text, oov_vector, is_tokenized) for text in texts]) if not pooling: diff --git a/embedding_as_service/text/xlnet/__init__.py b/embedding_as_service/text/xlnet/__init__.py index ac93822..163c7ab 100644 --- a/embedding_as_service/text/xlnet/__init__.py +++ b/embedding_as_service/text/xlnet/__init__.py @@ -58,6 +58,7 @@ def __init__(self): self.xlnet_config = None self.run_config = None self.model_name = None + self.max_seq_length = None self.sess = tf.Session() @staticmethod @@ -72,10 +73,9 @@ def tokenize(cls, text): text = preprocess_text(text, lower=False) return encode_pieces(cls.tokenizer, text) - @staticmethod - def _model_single_input(text: Union[str, List[str]], max_seq_length: int, is_tokenized: bool + def _model_single_input(self, text: Union[str, List[str]], is_tokenized: bool ) -> Tuple[List[int], List[int], List[int]]: - + max_seq_length = 
self.max_seq_length tokens_a = text if not is_tokenized: tokens_a = Embeddings.tokenize(text) @@ -115,24 +115,24 @@ def _model_single_input(text: Union[str, List[str]], max_seq_length: int, is_tok return input_ids, input_mask, segment_ids - def load_model(self, model: str, model_path: str): + def load_model(self, model: str, model_path: str, max_seq_length: int): model_path = os.path.join(model_path, next(os.walk(model_path))[1][0]) self.xlnet_config = xlnet.XLNetConfig(json_path=os.path.join(model_path, Embeddings.mode_config_path)) self.run_config = xlnet.create_run_config(is_training=True, is_finetune=True, FLAGS=Flags) self.load_tokenizer(model_path) + self.max_seq_length = max_seq_length self.model_name = model print("Model loaded Successfully !") def encode(self, texts: Union[List[str], List[List[str]]], pooling: str, - max_seq_length: int, is_tokenized: bool = False, **kwargs ) -> Optional[np.array]: input_ids, input_masks, segment_ids = [], [], [] for text in tqdm(texts, desc="Converting texts to features"): - input_id, input_mask, segment_id = self._model_single_input(text, max_seq_length, is_tokenized) + input_id, input_mask, segment_id = self._model_single_input(text, is_tokenized) input_ids.append(input_id) input_masks.append(input_mask) segment_ids.append(segment_id) diff --git a/requirements.txt b/requirements.txt index b1730ec..6bfd5b7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ smart_open==1.8.4 sentencepiece==0.1.82 tensorflow==1.15.0 setuptools>=41.0.0 +sentencepiece==0.1.85
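
For reference, a minimal usage sketch of the encoder after this change, based on the README examples above: `max_seq_length` is now passed to the `Encoder` constructor, the `download` flag is gone (the model is fetched automatically), and `encode()` no longer accepts `max_seq_length`. The model name and texts are taken from the README snippet; the exact output shape depends on the model actually loaded.

```python
# Minimal sketch of the updated API (model name from the README examples above;
# models are downloaded automatically, there is no `download` flag anymore).
from embedding_as_service.text.encode import Encoder

# max_seq_length moved from encode() to the Encoder constructor in this change.
en = Encoder(embedding='bert', model='bert_base_cased', max_seq_length=256)

# encode() no longer takes max_seq_length; padding/truncation follows the value
# given at construction time.
vecs = en.encode(texts=['hello aman', 'how are you?'])
print(vecs.shape)  # e.g. (2, 256, 768) token embeddings when no pooling is applied
```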