"""
Example of a Streamlit app for an interactive Prodigy dataset viewer that also lets you
run simple training experiments for NER and text classification.
Requires the Prodigy annotation tool to be installed: https://prodi.gy
See here for details on Streamlit: https://streamlit.io.
"""
import streamlit as st
from prodigy.components.db import connect
from prodigy.models.ner import EntityRecognizer, merge_spans, guess_batch_size
from prodigy.models.textcat import TextClassifier
from prodigy.util import split_evals
import pandas as pd
import spacy
from spacy import displacy
from spacy.util import filter_spans, minibatch
import random
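
# spaCy models offered in the sidebar; other installed models can be added to this list.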
SPACY_MODEL_NAMES = ["en_core_web_sm"]
EXC_FIELDS = ["meta", "priority", "score"]
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
COLOR_ACCEPT = "#93eaa1"
COLOR_REJECT = "#ff8f8e"
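

# Heuristic: guess the dataset type from the fields present in a single example.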
def guess_dataset_type(first_eg):
    if "image" in first_eg:
        return "image"
    if "arc" in first_eg:
        return "dep"
    if "options" in first_eg or "label" in first_eg:
        return "textcat"
    if "spans" in first_eg:
        return "ner"
    return "other"


def get_answer_counts(examples):
    result = {"accept": 0, "reject": 0, "ignore": 0}
    for eg in examples:
        answer = eg.get("answer")
        if answer:
            result[answer] += 1
    return result


def format_label(label, answer="accept"):
    # Hack to use different colors for the label (by adding zero-width space)
    return f"{label}\u200B" if answer == "reject" else label
st.sidebar.title("Prodigy Data Explorer")
db = connect()
db_sets = db.datasets
placeholder = "Select dataset..."
dataset = st.sidebar.selectbox(f"Datasets ({len(db_sets)})", [placeholder] + db_sets)

if dataset != placeholder:
    examples = db.get_dataset(dataset)
    st.header(f"{dataset} ({len(examples)})")
    if not len(examples):
        st.markdown("_Empty dataset._")
    else:
        counts = get_answer_counts(examples)
        st.markdown(", ".join(f"**{c}** {a}" for a, c in counts.items()))

        dataset_types = ["ner", "textcat", "dep", "pos", "image", "other"]
        guessed_index = dataset_types.index(guess_dataset_type(examples[0]))
        set_type = st.sidebar.selectbox("Dataset type", dataset_types, guessed_index)
        fields = list(examples[0].keys())
        default_fields = [f for f in fields if f[0] != "_" and f not in EXC_FIELDS]
        task_fields = st.sidebar.multiselect("Visible fields", fields, default_fields)
        st.dataframe(pd.DataFrame(examples).filter(task_fields), height=500)
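
        # For NER and text classification datasets, expose model and training settings in the sidebar.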
        if set_type in ["ner", "textcat"]:
            st.sidebar.header("Viewer options")
            purpose = "tokenization & training" if set_type == "ner" else "training"
            spacy_model_title = f"spaCy model for {purpose}"
            spacy_model = st.sidebar.selectbox(spacy_model_title, SPACY_MODEL_NAMES)
            st.sidebar.subheader("Training configuration")
            n_iter = st.sidebar.slider("Number of iterations", 1, 100, 5, 1)
            dropout = st.sidebar.slider("Dropout rate", 0.0, 1.0, 0.2, 0.05)
            eval_split_label = "% of examples held back for evaluation"
            eval_split = st.sidebar.slider(eval_split_label, 0.0, 1.0, 0.2, 0.05)
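
        # NER datasets: render the merged annotations with displaCy and offer a quick training experiment.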
        if set_type == "ner":
            st.subheader("Named entity viewer")
            nlp = spacy.load(spacy_model)
            merged_examples = merge_spans(list(examples))
            all_labels = set()
            for eg in merged_examples:
                for span in eg["spans"]:
                    all_labels.add(span["label"])
            colors = {}
            for label in all_labels:
                colors[label] = COLOR_ACCEPT
                colors[format_label(label, "reject")] = COLOR_REJECT
            ner_example_i = st.selectbox(
                f"Merged examples ({len(merged_examples)})",
                range(len(merged_examples)),
                format_func=lambda i: merged_examples[int(i)]["text"][:400],
            )
            ner_example = merged_examples[int(ner_example_i)]
            doc = nlp.make_doc(ner_example["text"])
            ents = []
            for span in ner_example.get("spans", []):
                label = format_label(span["label"], span["answer"])
                ents.append(doc.char_span(span["start"], span["end"], label=label))
            doc.ents = filter_spans(ents)
            html = displacy.render(doc, style="ent", options={"colors": colors})
            html = html.replace("\n", " ")  # Newlines seem to mess with the rendering
            st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
            show_ner_example_json = st.checkbox("Show JSON example")
            if show_ner_example_json:
                st.json(ner_example)
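
            # Experimental training: let the user say whether the data is gold-standard
            # and whether to start from a blank NER model.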
st.subheader("Train a model (experimental)")
no_missing = st.checkbox(
"Data is gold-standard and contains no missing values", False
)
start_blank = st.checkbox("Start with blank NER model", True)
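
            # Everything below only runs once the user clicks the button.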
            if st.button("🚀 Start training"):
                if start_blank:
                    ner = nlp.create_pipe("ner")
                    if "ner" in nlp.pipe_names:
                        nlp.replace_pipe("ner", ner)
                    else:
                        nlp.add_pipe(ner)
                    ner.begin_training([])
                else:
                    ner = nlp.get_pipe("ner")
                for label in all_labels:
                    ner.add_label(label)
                random.shuffle(merged_examples)
                train_examples, evals, eval_split = split_evals(
                    merged_examples, eval_split
                )
                st.success(
                    f"✅ Using **{len(train_examples)}** training examples "
                    f"and **{len(evals)}** evaluation examples with "
                    f"**{len(all_labels)}** label(s)"
                )
                annot_model = EntityRecognizer(
                    nlp, label=all_labels, no_missing=no_missing
                )
                batch_size = guess_batch_size(len(train_examples))
                baseline = annot_model.evaluate(evals)
                st.info(
                    f"ℹ️ **Baseline**\n**{baseline['right']:.0f}** right "
                    f"entities, **{baseline['wrong']:.0f}** wrong entities, "
                    f"**{baseline['unk']:.0f}** unknown entities, "
                    f"**{baseline['ents']:.0f}** total predicted, "
                    f"**{baseline['acc']:.2f}** accuracy"
                )
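
                # Set up a progress bar and a live results table, then train for n_iter
                # passes, re-evaluating on the held-out examples after each one.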
                progress = st.progress(0)
                results = []
                result_table = st.empty()
                best_acc = 0.0
                for i in range(n_iter):
                    random.shuffle(train_examples)
                    losses = annot_model.batch_train(
                        train_examples,
                        batch_size=batch_size,
                        drop=dropout,
                        beam_width=16,
                    )
                    stats = annot_model.evaluate(evals)
                    stats = {
                        "Right": stats["right"],
                        "Wrong": stats["wrong"],
                        "Unknown": stats["unk"],
                        "Predicted Ents": stats["ents"],
                        "Loss": losses["ner"],
                        "Accuracy": round(stats["acc"], 3),
                    }
                    best_acc = (
                        stats["Accuracy"] if stats["Accuracy"] > best_acc else best_acc
                    )

                    def highlight(v):
                        is_best = v != 0 and v == best_acc
                        return f"background: {'yellow' if is_best else 'white'}"

                    results.append(stats)
                    results_df = pd.DataFrame(results, dtype="float")
                    result_table.dataframe(results_df.style.applymap(highlight))
                    progress.progress(int((i + 1) / n_iter * 100))
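
        # Text classification datasets: no visualizer here, just the experimental training run.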
        elif set_type == "textcat":
            st.subheader("Train a model (experimental)")
            exclusive = st.checkbox("Labels are mutually exclusive", False)

            if st.button("🚀 Start training"):
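                # Rebuild the pipeline: load the base model, collect the label set from the
                # examples' "accept" lists and "label" fields, and add a fresh textcat pipe.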
                nlp = spacy.load(spacy_model)
                examples = list(examples)
                all_labels = set()
                for eg in examples:
                    all_labels.update(eg.get("accept", []))
                    if "label" in eg:
                        all_labels.add(eg["label"])
                textcat = nlp.create_pipe("textcat")
                for label in all_labels:
                    textcat.add_label(label)
                textcat.begin_training()
                nlp.add_pipe(textcat)
                random.shuffle(examples)
                train_examples, evals, eval_split = split_evals(examples, eval_split)
                st.success(
                    f"✅ Using **{len(train_examples)}** training examples "
                    f"and **{len(evals)}** evaluation examples with "
                    f"**{len(all_labels)}** label(s)"
                )
                annot_model = TextClassifier(
                    nlp,
                    all_labels,
                    low_data=len(train_examples) < 1000,
                    exclusive_classes=exclusive,
                )
                progress = st.progress(0)
                results = []
                result_table = st.empty()
                best_acc = 0.0
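
                # Train in minibatches of 10, evaluating with the averaged parameters
                # after each pass and updating the results table live.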
                for i in range(n_iter):
                    loss = 0.0
                    random.shuffle(train_examples)
                    for batch in minibatch(train_examples, size=10):
                        batch = list(batch)
                        loss += annot_model.update(batch, revise=False, drop=dropout)
                    with nlp.use_params(annot_model.optimizer.averages):
                        stats = annot_model.evaluate(evals)
                    stats = {
                        "Loss": loss,
                        "F-Score": stats["fscore"],
                        "Accuracy": round(stats["accuracy"], 3),
                    }
                    best_acc = (
                        stats["Accuracy"] if stats["Accuracy"] > best_acc else best_acc
                    )

                    def highlight(v):
                        is_best = v != 0 and v == best_acc
                        return f"background: {'yellow' if is_best else 'white'}"

                    results.append(stats)
                    results_df = pd.DataFrame(results, dtype="float").round(3)
                    result_table.dataframe(results_df.style.applymap(highlight))
                    progress.progress(int((i + 1) / n_iter * 100))