
Commit

added more vectorization
huseinzol05 committed Nov 11, 2020
1 parent e24c2ae commit d98013b
Showing 15 changed files with 2,852 additions and 288 deletions.
730 changes: 730 additions & 0 deletions load-keyword-extraction.ipynb

Large diffs are not rendered by default.

872 changes: 872 additions & 0 deletions load-similarity.ipynb

Large diffs are not rendered by default.

895 changes: 895 additions & 0 deletions load-zeroshot-classification.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions malaya/__init__.py
@@ -13,8 +13,8 @@


home = os.path.join(str(Path.home()), 'Malaya')
-version = '3.9'
-bump_version = '3.9.2'
+version = '4.0'
+bump_version = '4.0'
version_path = os.path.join(home, 'version')
__version__ = bump_version
path = os.path.dirname(__file__)
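Since the release constants changed, a quick sanity check is to confirm the installed version at runtime. A minimal sketch, assuming Malaya is installed and importable in the current environment:

```python
import malaya

# __version__ is assigned from bump_version in malaya/__init__.py,
# so this should print '4.0' for this release.
print(malaya.__version__)
```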
71 changes: 68 additions & 3 deletions malaya/model/bert.py
@@ -729,16 +729,16 @@ def __init__(
segment_ids = segment_ids,
input_masks = input_masks,
logits = logits,
+vectorizer = vectorizer,
sess = sess,
tokenizer = tokenizer,
label = label,
)
-self._vectorizer = vectorizer
self._softmax = tf.nn.softmax(self._logits)
self._batch_size = 20

def _base(self, strings_left, strings_right):
-input_ids, input_masks, segment_ids = bert_tokenization_siamese(
+input_ids, input_masks, segment_ids, _ = bert_tokenization_siamese(
self._tokenizer, strings_left, strings_right
)

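With this change `bert_tokenization_siamese` returns a fourth value (the sentencepiece tokens of each example), which `_base` does not need and discards with `_`; the tokens are only consumed by the new token-level vectorization further down. The sketch below is a generic illustration of how paired inputs for a siamese BERT-style model are laid out, not Malaya's actual helper; the whitespace tokenizer, fake ids, and `toy_pair_encode` name are all made up for the example:

```python
from typing import List, Tuple

def toy_pair_encode(left: str, right: str) -> Tuple[List[int], List[int], List[int], List[str]]:
    """Toy sentence-pair encoder: [CLS] left [SEP] right [SEP]."""
    tokens = ['[CLS]'] + left.split() + ['[SEP]'] + right.split() + ['[SEP]']
    input_ids = [hash(t) % 1000 for t in tokens]        # fake vocabulary ids
    input_masks = [1] * len(tokens)                     # no padding in this toy
    boundary = len(left.split()) + 2                    # [CLS] + left tokens + first [SEP]
    segment_ids = [0] * boundary + [1] * (len(tokens) - boundary)
    return input_ids, input_masks, segment_ids, tokens

# Callers that only need probabilities can ignore the tokens, as _base does above.
ids, masks, segments, _ = toy_pair_encode('ayam goreng sedap', 'nasi lemak pun sedap')
```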
@@ -1066,6 +1066,7 @@ def __init__(
segment_ids,
input_masks,
logits,
vectorizer,
sess,
tokenizer,
label = ['not similar', 'similar'],
@@ -1076,6 +1077,7 @@ def __init__(
segment_ids = segment_ids,
input_masks = input_masks,
logits = logits,
vectorizer = vectorizer,
sess = sess,
tokenizer = tokenizer,
label = label,
@@ -1092,7 +1094,7 @@ def _base(self, strings, labels):
mapping[no].append(index)
index += 1

-input_ids, input_masks, segment_ids = bert_tokenization_siamese(
+input_ids, input_masks, segment_ids, _ = bert_tokenization_siamese(
self._tokenizer, strings_left, strings_right
)

@@ -1113,6 +1115,69 @@ def _base(self, strings, labels):
results.append(result)
return results

@check_type
def vectorize(
self, strings: List[str], labels: List[str], method: str = 'first'
):
"""
vectorize strings and labels.
Parameters
----------
strings: List[str]
labels : List[str]
method : str, optional (default='first')
Vectorization layer supported. Allowed values:
* ``'last'`` - vector from last sequence.
* ``'first'`` - vector from first sequence.
* ``'mean'`` - average vectors from all sequences.
* ``'word'`` - average vectors based on tokens.
Returns
-------
result: np.array
"""
strings_left, strings_right, combined = [], [], []
for no, string in enumerate(strings):
for label in labels:
strings_left.append(string)
strings_right.append(f'teks ini adalah mengenai {label}')
combined.append((string, label))

input_ids, input_masks, segment_ids, s_tokens = bert_tokenization_siamese(
self._tokenizer, strings_left, strings_right
)

v = self._sess.run(
self._vectorizer,
feed_dict = {
self._X: input_ids,
self._segment_ids: segment_ids,
self._input_masks: input_masks,
},
)
if len(v.shape) == 2:
v = v.reshape((*np.array(input_ids).shape, -1))

if method == 'first':
v = v[:, 0]
elif method == 'last':
v = v[:, -1]
elif method == 'mean':
v = np.mean(v, axis = 1)
else:
v = [
merge_sentencepiece_tokens(
list(zip(s_tokens[i], v[i][: len(s_tokens[i])])),
weighted = False,
vectorize = True,
)
for i in range(len(v))
]
return combined, v

@check_type
def predict_proba(self, strings: List[str], labels: List[str]):
"""
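The new `vectorize` on the zero-shot model pairs every input string with every candidate label, using the Malay template 'teks ini adalah mengenai {label}' as the right-hand sentence, runs the vectorizer node, then reduces the per-token output according to `method`. Below is a minimal numpy sketch of the three sequence-level pooling options; the array shape and contents are made up, and the 'word' path is omitted because it relies on Malaya's `merge_sentencepiece_tokens` helper:

```python
import numpy as np

batch, seq_len, dim = 4, 12, 8            # made-up sizes
v = np.random.rand(batch, seq_len, dim)   # stand-in for the vectorizer output

def pool(v: np.ndarray, method: str = 'first') -> np.ndarray:
    # Mirrors the sequence-level branching in the new vectorize methods.
    if method == 'first':   # vector at the first position (CLS-style)
        return v[:, 0]
    if method == 'last':    # vector at the last position
        return v[:, -1]
    if method == 'mean':    # average across the sequence axis
        return np.mean(v, axis = 1)
    raise ValueError("this sketch only covers 'first', 'last' and 'mean'")

print(pool(v, 'mean').shape)  # (4, 8)
```

Because every string is paired with every label, the returned `combined` list (and the first axis of the pooled vectors) has `len(strings) * len(labels)` entries, one per (string, label) pair.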
86 changes: 80 additions & 6 deletions malaya/model/xlnet.py
@@ -752,24 +752,22 @@ def __init__(
X = X,
segment_ids = segment_ids,
input_masks = input_masks,
+vectorizer = vectorizer,
logits = logits,
sess = sess,
tokenizer = tokenizer,
label = label,
)
-self._vectorizer = vectorizer
self._softmax = tf.nn.softmax(self._logits)
self._batch_size = 20

def _base(self, strings_left, strings_right):
-input_ids, input_masks, segment_ids = xlnet_tokenization_siamese(
+input_ids, input_masks, segment_ids, _ = xlnet_tokenization_siamese(
self._tokenizer, strings_left, strings_right
)
-segment_ids = np.array(segment_ids)
-batch_segment[batch_segment == 0] = 1

return self._sess.run(
-self._vectorizer,
+self._softmax,
feed_dict = {
self._X: input_ids,
self._segment_ids: segment_ids,
@@ -793,6 +791,16 @@ def vectorize(self, strings: List[str]):
input_ids, input_masks, segment_ids, _ = xlnet_tokenization(
self._tokenizer, strings
)
segment_ids = np.array(segment_ids)
segment_ids[segment_ids == 0] = 1
return self._sess.run(
self._vectorizer,
feed_dict = {
self._X: input_ids,
self._segment_ids: segment_ids,
self._input_masks: input_masks,
},
)
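The similarity model's `vectorize` now coerces `segment_ids` to a numpy array and rewrites zeros to ones before feeding the graph. A tiny sketch of that in-place remap on dummy data; the motivation (matching the segment convention the XLNet graph expects) is my reading of the change, not something stated in the commit:

```python
import numpy as np

segment_ids = np.array([[0, 0, 1, 1], [0, 0, 0, 1]])  # dummy segment ids
segment_ids[segment_ids == 0] = 1                      # same remap as the new code
print(segment_ids)  # [[1 1 1 1] [1 1 1 1]]
```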

@check_type
def predict_proba(self, strings_left: List[str], strings_right: List[str]):
@@ -1111,6 +1119,7 @@ def __init__(
segment_ids,
input_masks,
logits,
vectorizer,
sess,
tokenizer,
label = ['not similar', 'similar'],
@@ -1121,6 +1130,7 @@ def __init__(
segment_ids = segment_ids,
input_masks = input_masks,
logits = logits,
vectorizer = vectorizer,
sess = sess,
tokenizer = tokenizer,
label = label,
@@ -1138,7 +1148,7 @@ def _base(self, strings, labels):
mapping[no].append(index)
index += 1

-input_ids, input_masks, segment_ids = xlnet_tokenization_siamese(
+input_ids, input_masks, segment_ids, _ = xlnet_tokenization_siamese(
self._tokenizer, strings_left, strings_right
)

@@ -1159,6 +1169,70 @@ def _base(self, strings, labels):
results.append(result)
return results

@check_type
def vectorize(
self, strings: List[str], labels: List[str], method: str = 'first'
):
"""
vectorize strings and labels.
Parameters
----------
strings: List[str]
labels : List[str]
method : str, optional (default='first')
Vectorization layer supported. Allowed values:
* ``'last'`` - vector from last sequence.
* ``'first'`` - vector from first sequence.
* ``'mean'`` - average vectors from all sequences.
* ``'word'`` - average vectors based on tokens.
Returns
-------
result: np.array
"""

strings_left, strings_right, combined = [], [], []
for no, string in enumerate(strings):
for label in labels:
strings_left.append(string)
strings_right.append(f'teks ini adalah mengenai {label}')
combined.append((string, label))

input_ids, input_masks, segment_ids, s_tokens = xlnet_tokenization_siamese(
self._tokenizer, strings_left, strings_right
)

v = self._sess.run(
self._vectorizer,
feed_dict = {
self._X: input_ids,
self._segment_ids: segment_ids,
self._input_masks: input_masks,
},
)
v = np.transpose(v, [1, 0, 2])

if method == 'first':
v = v[:, 0]
elif method == 'last':
v = v[:, -1]
elif method == 'mean':
v = np.mean(v, axis = 1)
else:
v = [
merge_sentencepiece_tokens(
list(zip(s_tokens[i], v[i][: len(s_tokens[i])])),
weighted = False,
vectorize = True,
model = 'xlnet',
)
for i in range(len(v))
]
return combined, v

@check_type
def predict_proba(self, strings: List[str], labels: List[str]):
"""
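Unlike the BERT version, the XLNet `vectorize` calls `np.transpose(v, [1, 0, 2])` before pooling, swapping the first two axes. Reading the code, that suggests the XLNet vectorizer node emits a (sequence, batch, dim) tensor and the transpose restores the (batch, sequence, dim) layout the pooling branches expect; that interpretation is inferred from the diff, not stated in it. A small numpy sketch of the axis swap:

```python
import numpy as np

seq_len, batch, dim = 12, 4, 8                      # made-up sizes
v_time_major = np.random.rand(seq_len, batch, dim)  # assumed sequence-first layout

v = np.transpose(v_time_major, [1, 0, 2])  # same call as in the XLNet vectorize
print(v.shape)  # (4, 12, 8): batch first, ready for v[:, 0], v[:, -1] or a mean over axis 1
```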
14 changes: 13 additions & 1 deletion malaya/path/__init__.py
@@ -274,7 +274,7 @@
},
'alxlnet': {
'model': 'v34/emotion/alxlnet-base-emotion.pb',
-'quantized': 'v34/emotion/alxlnet-base-emotion.pb.quantized',
+'quantized': 'v40/emotion/alxlnet-base-emotion.pb.quantized',
'vocab': 'tokenizer/sp10m.cased.v9.vocab',
'tokenizer': 'tokenizer/sp10m.cased.v9.model',
},
@@ -857,36 +857,42 @@
PATH_SIMILARITY = {
'bert': {
'model': home + '/similarity/bert/base/model.pb',
'quantized': home + '/similarity/bert/base/quantized/model.pb',
'vocab': home + '/bert/sp10m.cased.bert.vocab',
'tokenizer': home + '/bert/sp10m.cased.bert.model',
'version': 'v36',
},
'tiny-bert': {
'model': home + '/similarity/bert/tiny/model.pb',
'quantized': home + '/similarity/bert/tiny/quantized/model.pb',
'vocab': home + '/bert/sp10m.cased.bert.vocab',
'tokenizer': home + '/bert/sp10m.cased.bert.model',
'version': 'v36',
},
'albert': {
'model': home + '/similarity/albert/base/model.pb',
'quantized': home + '/similarity/albert/base/quantized/model.pb',
'vocab': home + '/albert/sp10m.cased.v10.vocab',
'tokenizer': home + '/albert/sp10m.cased.v10.model',
'version': 'v36',
},
'tiny-albert': {
'model': home + '/similarity/albert/tiny/model.pb',
'quantized': home + '/similarity/albert/tiny/quantized/model.pb',
'vocab': home + '/bert/sp10m.cased.bert.vocab',
'tokenizer': home + '/bert/sp10m.cased.bert.model',
'version': 'v36',
},
'xlnet': {
'model': home + '/similarity/xlnet/base/model.pb',
'quantized': home + '/similarity/xlnet/base/quantized/model.pb',
'vocab': home + '/xlnet/sp10m.cased.v9.vocab',
'tokenizer': home + '/xlnet/sp10m.cased.v9.model',
'version': 'v36',
},
'alxlnet': {
'model': home + '/similarity/alxlnet/base/model.pb',
'quantized': home + '/similarity/alxlnet/base/quantized/model.pb',
'vocab': home + '/xlnet/sp10m.cased.v9.vocab',
'tokenizer': home + '/xlnet/sp10m.cased.v9.model',
'version': 'v36',
@@ -896,31 +902,37 @@
S3_PATH_SIMILARITY = {
'bert': {
'model': 'v36/similarity/bert-base-similarity.pb',
'quantized': 'v40/similarity/bert-base-similarity.pb.quantized',
'vocab': 'tokenizer/sp10m.cased.bert.vocab',
'tokenizer': 'tokenizer/sp10m.cased.bert.model',
},
'tiny-bert': {
'model': 'v36/similarity/tiny-bert-similarity.pb',
'quantized': 'v40/similarity/tiny-bert-similarity.pb.quantized',
'vocab': 'tokenizer/sp10m.cased.bert.vocab',
'tokenizer': 'tokenizer/sp10m.cased.bert.model',
},
'albert': {
'model': 'v36/similarity/albert-base-similarity.pb',
'quantized': 'v40/similarity/albert-base-similarity.pb.quantized',
'vocab': 'tokenizer/sp10m.cased.v10.vocab',
'tokenizer': 'tokenizer/sp10m.cased.v10.model',
},
'tiny-albert': {
'model': 'v36/similarity/albert-tiny-similarity.pb',
'quantized': 'v40/similarity/albert-tiny-similarity.pb.quantized',
'vocab': 'tokenizer/sp10m.cased.v10.vocab',
'tokenizer': 'tokenizer/sp10m.cased.v10.model',
},
'xlnet': {
'model': 'v36/similarity/xlnet-base-similarity.pb',
'quantized': 'v40/similarity/xlnet-base-similarity.pb.quantized',
'vocab': 'tokenizer/sp10m.cased.v9.vocab',
'tokenizer': 'tokenizer/sp10m.cased.v9.model',
},
'alxlnet': {
'model': 'v36/similarity/alxlnet-base-similarity.pb',
'quantized': 'v40/similarity/alxlnet-base-similarity.pb.quantized',
'vocab': 'tokenizer/sp10m.cased.v9.vocab',
'tokenizer': 'tokenizer/sp10m.cased.v9.model',
},
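Every similarity entry now carries both a 'model' and a 'quantized' graph path, locally and on S3. The sketch below shows how a loader might choose between them from a `quantized` flag; the dict literal just mirrors the structure above, and `resolve_graph_path` is an illustrative helper, not Malaya's actual file-checking code:

```python
import os
from pathlib import Path

home = os.path.join(str(Path.home()), 'Malaya')

PATH_SIMILARITY = {
    'bert': {
        'model': home + '/similarity/bert/base/model.pb',
        'quantized': home + '/similarity/bert/base/quantized/model.pb',
    },
}

def resolve_graph_path(model: str = 'bert', quantized: bool = False) -> str:
    # Illustrative only: pick the quantized frozen graph when requested.
    key = 'quantized' if quantized else 'model'
    return PATH_SIMILARITY[model][key]

print(resolve_graph_path('bert', quantized = True))
```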
