-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathharness.py
executable file
·134 lines (114 loc) · 4.3 KB
/
harness.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env python
"""Simple BERT tokenization and embeddings example.
"""
__author__ = 'Paul Landes'
from typing import Tuple
from dataclasses import dataclass, field
import logging
from io import StringIO
from torch import Tensor
from zensols.cli import CliHarness
from zensols.config import ConfigFactory
from zensols.nlp import FeatureToken, FeatureSentence, FeatureDocument
from zensols.deepnlp.vectorize import (
FeatureVectorizer, FeatureVectorizerManager
)
from zensols.deepnlp.transformer import (
TokenizedFeatureDocument, WordPieceFeatureDocument,
WordPieceFeatureDocumentFactory,
TransformerDocumentTokenizer, TransformerEmbedding
)
logger = logging.getLogger(__name__)
CONFIG = """
[cli]
apps = list: log_cli, cleaner_cli, app
[package]
name = harness
[import]
references = list: package
sections = list: imp_obj
config_files = list:
resource(zensols.util): resources/default.conf,
resource(zensols.util): resources/escape.conf,
resource(zensols.util): resources/cli.conf,
resource(zensols.util): resources/cleaner.conf
[imp_obj]
type = importini
config_files = list:
resource(zensols.deeplearn): resources/default.conf,
resource(zensols.deepnlp): resources/default.conf,
resource(zensols.deeplearn): resources/obj.conf,
resource(zensols.nlp): resources/obj.conf,
resource(zensols.deepnlp): resources/obj.conf
[map_filter_token_normalizer]
embed_entities = False
[app]
class_name = ${package:name}.Application
"""
@dataclass
class Application(object):
    """The demo application entry point.

    """
    CLI_META = {'option_excludes': {'config_factory'}}

    config_factory: ConfigFactory = field()
    """Set by the framework and used to get vectorizers from the application
    configuration.

    """
    def traintest(self, write: str = 'wordpiece'):
        """Parse and vectorize a sentence in to BERT embeddings (the action
        naming misnomer is unfortunately needed for the build automation).

        :param write: what to output: ``tokenize`` writes the tokenized
                      document, ``wordpiece`` writes the word piece document,
                      ``map`` prints the token to word piece mapping

        """
        sents: str = """\
South Korea's Unification Minister Kwon Young-sesaid the North might postpone \
its nuclear test for some time. North Korea has also achieved some political \
effects by codifying its nuclear law in August.
"""
        # create the vectorizers from the application config
        vec_mng: FeatureVectorizerManager = self.config_factory(
            'language_vectorizer_manager')
        vec: FeatureVectorizer = vec_mng['transformer_fixed']
        embed: TransformerEmbedding = vec.embed_model
        # reuse the resolved embedding model instead of re-traversing
        # ``vec.embed_model`` for each access
        tokenizer: TransformerDocumentTokenizer = embed.tokenizer
        # parse a feature document
        fdoc: FeatureDocument = vec_mng.doc_parser.parse(sents.strip())
        # show the tokenized document
        tdoc: TokenizedFeatureDocument = tokenizer.tokenize(fdoc)
        tdoc_det: TokenizedFeatureDocument = tdoc.detach()
        if write == 'tokenize':
            tdoc.write()
            tdoc_det.write()
        elif write == 'wordpiece':
            doc_fac: WordPieceFeatureDocumentFactory = self.config_factory(
                'word_piece_doc_factory')
            wpdoc: WordPieceFeatureDocument = doc_fac(fdoc, tdoc)
            wpdoc.write()
        elif write == 'map':
            for m in tdoc.map_to_word_pieces(fdoc, tokenizer.id2tok):
                sent: FeatureSentence = m['sent']
                print(sent)
                n_wp: int = 0
                tok: FeatureToken
                # variadic tuple of word piece strings per feature token
                wps: Tuple[str, ...]
                for tok, wps in m['map']:
                    print(' ', wps)
                    n_wp += len(wps)
                print(f' word pieces: {n_wp}')
        # transforming the already tokenized document avoids duplicating the
        # tokenization work; ``vec.transform(fdoc)`` is functionally the same
        # but slightly slower since it re-tokenizes
        arr: Tensor = embed.transform(tdoc)
        # the tensor should match up with the max sentence word piece count
        # but add the [CLS] and [SEP] tokens
        print(f'tensor: {arr.shape}')
if __name__ == '__main__':
    # quiet the (noisy) transformers library warnings before startup
    from zensols.deepnlp import transformer
    transformer.suppress_warnings()
    # run the CLI against the in-memory application context above; the
    # prototype arguments invoke the ``traintest`` action and the reload
    # pattern enables hot code reloading of this module during development
    harness = CliHarness(
        app_config_resource=StringIO(CONFIG),
        proto_args='traintest',
        proto_factory_kwargs={'reload_pattern': '^harness'},
    )
    harness.run()