-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathapp.py
235 lines (196 loc) · 12.3 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
import gradio as gr
import uuid, os
import stylo_app
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.application import MIMEApplication
# Custom stylesheet handed to gr.Blocks(css=...): centres the page title,
# restyles the progress bar and forces hover colours on two buttons by id.
# The tripled ".progress-bar-wrap" selector is presumably there to raise CSS
# specificity over Gradio's own rule -- TODO confirm.
# NOTE(review): no component visible in this file sets elem_id='submit-button',
# so the "#submit-button:hover" rule currently appears to match nothing.
css = """
h1 {
display: block;
text-align: center;
font-size: 32pt;
}
.progress-bar-wrap.progress-bar-wrap.progress-bar-wrap
{
border-radius: var(--input-radius);
height: 1.25rem;
margin-top: 1rem;
overflow: hidden;
width: 70%;
}
#submit-button:hover {
background-color: blue !important; /* Force blue background on hover */
}
#cancel-button:hover {
background-color: red !important; /* Force red background on hover */
}
"""
# Application theme: Gradio's "Soft" theme with indigo/amber hues, plus
# overrides for the primary-button fill (normal and hover) and the
# block-label background.
theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="amber").set(
    block_label_background_fill="*primary_50",
    button_primary_background_fill="*secondary_500",
    button_primary_background_fill_hover="*secondary_400",
)
def set_visibility():
    """
    Reset widget visibility when a new run is initiated.

    Returns a tuple of eight component updates: the first makes its target
    visible, the remaining seven hide theirs.
    """
    shown = gr.update(visible=True)
    hidden = [gr.update(visible=False) for _ in range(7)]
    return (shown, *hidden)
def generate_run_id():
    """
    Create a fresh random identifier for a run.

    Returns a component update that sets the value to a UUID4 string and
    makes the component visible.
    """
    return gr.update(visible=True, value=str(uuid.uuid4()))
def show_input(input_type):
    """
    Decide which of the two input widgets is displayed.

    Returns two component updates: when ``input_type`` equals "Corpus" the
    first is visible and the second hidden; for any other value it is the
    other way round.
    """
    is_corpus = input_type == "Corpus"
    return gr.update(visible=is_corpus), gr.update(visible=not is_corpus)
def show_sttr_span_textbox(metric):
    """
    Toggle the token-span parameter depending on the diversity metric.

    Returns a component update that is visible only when ``metric`` is "STTR".
    """
    wants_span = metric == "STTR"
    return gr.update(visible=wants_span)
def visible_output():
    """
    Show the first output widget and hide the four that follow it.

    Returns exactly five component updates, one per entry of the ``outputs``
    list this handler is registered with (zip_out, dep_plot, pos_plot,
    punct_plot, len_plot). Returning more values than registered outputs makes
    Gradio warn about "too many output values" and discard the surplus, so the
    count is kept in lock-step with that list.
    """
    shown = gr.update(visible=True)
    hidden = [gr.update(visible=False) for _ in range(4)]
    return (shown, *hidden)
def visible_plots(error_or_canceled):
    """
    Reveal the result widgets after a run, unless it failed or was canceled.

    Parameters:
        error_or_canceled: the error/cancel flag. It is read from a Textbox
            component, so it normally arrives as a string; booleans and None
            are accepted as well.

    Returns:
        A tuple of seven component updates. If the flag is set, all seven
        hide their target. Otherwise the first five show their target and the
        last two hide theirs.
    """
    if isinstance(error_or_canceled, str):
        # bool("False") is True, so a textual flag must be parsed rather than
        # cast; blank and "false"-like strings mean "no error / not canceled".
        flagged = error_or_canceled.strip().lower() not in ("", "false", "0", "none")
    else:
        flagged = bool(error_or_canceled)

    if flagged:
        return tuple(gr.update(visible=False) for _ in range(7))

    shown = [gr.update(visible=True) for _ in range(5)]
    return (*shown, gr.update(visible=False), gr.update(visible=False))
# User interface: a tabbed Gradio Blocks app with the analysis pipeline, two
# hosted model demos, user guidelines and an "About" page.
with gr.Blocks(title="Styloscope", theme=theme, css=css) as demo:
    title = gr.Markdown("""# Styloscope""")

    with gr.Tab("Pipeline"):
        # components
        with gr.Row():
            gr.Markdown("### Provide input data")

        with gr.Row(variant='panel'):
            input_type = gr.Radio(
                choices=['Corpus', 'HuggingFace dataset'],
                value='Corpus',
                label='Type',
                interactive=True,
                info="""Upload your own corpus or download a public dataset from the HuggingFace hub.""",
            )

        with gr.Row(variant='panel'):
            # Only one of these two columns is visible at a time (see show_input).
            with gr.Column(visible=True) as corpus_widget:
                file = gr.File(file_types=['.csv', '.zip'], file_count="single")
            with gr.Column(visible=False) as hf_widget:
                dataset = gr.Textbox(label="Name", info="Dataset identifier mentioned on HuggingFace.")
                subset = gr.Textbox(label="Subset", info="Mandatory if dataset contains subsets.")
                split = gr.Textbox(label="Split", info="Mandatory if dataset contains splits.")

        with gr.Row(variant='panel'):
            column_name = gr.Textbox(
                label="Column",
                info="Column name (for .csv / Huggingface) that will be used for writing style analysis.",
            )

        # Swap the corpus / HuggingFace widgets when the input type changes.
        input_type.change(
            show_input, input_type, [corpus_widget, hf_widget]
        )

        with gr.Row():
            gr.Markdown("### Set pipeline parameters")

        with gr.Row(variant='panel'):
            lang = gr.Dropdown(
                ["Dutch", "English", "French", "German"],
                label="Language",
                value="Dutch",
                interactive=True,
            )
            readability = gr.Dropdown(
                ["ARI", "Coleman-Liau", "Flesch reading ease", "Flesch Kincaid grade level", "Gunning Fog", "SMOG", "LIX", "RIX"],
                label="Readability metric",
                value="RIX",
                interactive=True,
            )
            diversity = gr.Dropdown(
                ["STTR", "TTR", "RTTR", "CTTR", "Herdan", "Summer", "Dugast", "Maas"],
                label="Lexical diversity metric",
                value="STTR",
                interactive=True,
            )
            span_size = gr.Textbox(label='STTR token span', value=100, visible=True)

        # The token-span box is only relevant for STTR.
        diversity.change(
            show_sttr_span_textbox, diversity, [span_size]
        )

        # NOTE(review): "Panel" (capital P) differs from the 'panel' variant
        # used by the other rows; Gradio documents 'default', 'panel' and
        # 'compact' -- confirm whether the capitalised value is intentional.
        with gr.Row(variant="Panel"):
            # NOTE(review): the stylesheet has a "#submit-button:hover" rule,
            # but this button has no elem_id, so that rule does not apply here.
            button = gr.Button('Submit', variant='primary')

        # outputs
        run_id = gr.Textbox(label='Run index', info="", visible=False, interactive=False)
        zip_out = gr.File(label='Output', visible=False)
        basic_statistics = gr.Dataframe(headers=['Corpus statistics', 'Mean', 'Std.'], visible=False)
        dep_plot = gr.Plot(label='Distribution of syntactic dependencies', show_label=True, visible=False)
        pos_plot = gr.Plot(label='Distribution of part-of-speech tags', show_label=True, visible=False)
        punct_plot = gr.Plot(label='Distribution of punctuation marks', show_label=True, visible=False)
        len_plot = gr.Plot(label='Distribution of word lengths', show_label=True, visible=False)

        # Flag that keeps track of whether there was an error or cancel
        # instruction; used to regulate the visibility of the widgets. It must
        # be a Gradio component because it is used as input of a function
        # triggered by the button click.
        error_or_canceled = gr.Textbox(value="False", interactive=True, visible=False)

        with gr.Row(variant="Panel"):
            cancel_button = gr.Button('Cancel', variant='primary', elem_id='cancel-button', visible=False, interactive=True)

        # Event chain started by the Submit button; each step runs after the
        # previous one.
        visibility_event = button.click(  # first set visibility of the widgets
            set_visibility,
            outputs=[cancel_button, run_id, zip_out, basic_statistics, dep_plot, pos_plot, punct_plot, len_plot],
            trigger_mode="once",
        )
        id_event = visibility_event.then(  # then generate and show run index
            generate_run_id,
            outputs=run_id,
        )
        output_event = id_event.then(  # then make zip output component visible (so that progress bar is visible)
            visible_output,
            outputs=[zip_out, dep_plot, pos_plot, punct_plot, len_plot]
        )
        pipe_event = output_event.then(  # then run pipeline
            stylo_app.main,
            inputs=[input_type, file, dataset, subset, split, column_name, lang, readability, diversity, span_size, run_id],
            outputs=[zip_out, basic_statistics, dep_plot, pos_plot, punct_plot, len_plot, error_or_canceled]
        )
        plots_event = pipe_event.then(  # then make plots visible
            visible_plots,
            inputs=[error_or_canceled],
            outputs=[basic_statistics, dep_plot, pos_plot, punct_plot, len_plot, run_id, cancel_button]
        )

        cancel_button.click(stylo_app.stop_function, outputs=[cancel_button, run_id])

    with gr.Tab("Dutch authorship attribution demo"):
        # Interfaces loaded from hosted models (src="models").
        gr.load("clips/xlm-roberta-text-genre-dutch", src="models", title="", description="**Text genre prediction**")
        gr.load("clips/robbert-2023-dutch-base-gender", src="models", title="", description="**Gender prediction**")

    with gr.Tab("User guidelines"):
        gr.Markdown("""### Input""")
        gr.Markdown("""The input of the pipeline must be either a corpus or a publicly available HuggingFace dataset.
Corpora can be uploaded as a .zip folder containing UTF8-encoded .txt files, or as a .csv file containing one document per row (documents must be placed under a column named "text", additional columns are allowed and do not affect the pipeline).
When using a HuggingFace dataset, you will be asked to specify the dataset identifier, the text column on which an analysis needs to be performed, and optionally the data subset and split of interest.
""")
        gr.Markdown("""### Language""")
        gr.Markdown("""The user must specify the language of the input corpus. Our tool currently supports analysis of Dutch, French, English, and German. Note that the tools used in this pipeline have been trained on standard, contemporary language, and will therefore perform best on this type of data. When using a multilingual corpus, it is best to split up the data per language and run the pipeline multiple times while selecting a different language per run.""")
        gr.Markdown("""### Readability metrics""")
        # Blank lines around the table and between the footnotes keep the
        # footnote lines from being absorbed into the table as extra rows.
        gr.Markdown(
            """
We recommend using RIX, because it is the most intuitive metric, and it was not designed for a specific language or text genre.

| Metric | Formula | Language |
|--------------|--------------------------------------------------|--------|
| ARI | 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43 | English |
| Coleman-Liau* | 0.0588 * L - 0.296 * S - 15.8 | English;<br />Texts must be > 100 tokens |
| Flesch reading ease | 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words) | English |
| Flesch Kincaid grade level | 11.8 * (syllables / words) + 0.39 * (words / sentences) - 15.59 | English |
| Gunning Fog** | 0.4 * (words / sentences + % complex words) | English;<br />Texts must be > 100 syllables |
| LIX*** | (words / sentences) + (100 * (long words / words)) | Language-independent |
| RIX*** | (long words / sentences) + (words / sentences) | Language-independent;<br />More interpretable version of LIX |
| SMOG** | sqrt(complex words) + 3 | English;<br />Orig. developed for clinical texts;<br />Texts must be > 30 sentences. |

*L = Average number of characters per 100 tokens<br /> S = Average number of sentences per 100 tokens

**Complex words = words that contain at least 3 syllables

***Long words = words longer than 6 characters
"""
        )
        gr.Markdown("""### Lexical diversity metrics""")
        gr.Markdown("""
We recommend using the standardized type-token ratio (STTR), since it is less likely to be influenced by varying text lengths.

| Metric | Formula |
|--------|-------------------------------------------------|
| TTR | Number of unique words / Total number of words |
| STTR | Mean of TTR scores per n words (returns TTR if text < n words) |
| RTTR | Number of unique words / sqrt(Total number of words) |
| CTTR | Number of unique words / sqrt(2 * Total number of words) |
| Herdan | log(Number of unique words) / log(Total number of words) |
| Summer | log( log(Number of unique words) ) / log( log(Total number of words) ) |
| Dugast | ( log(Total number of words)**2) / ( log(Total number of words) - log(Number of unique words) ) |
| Maas | ( log(Total number of words) - log(Number of unique words) ) / log(Total number of words)**2 |
"""
        )

    with gr.Tab("About"):
        # NOTE(review): both mailto targets below read "[email protected]",
        # which looks like an e-mail obfuscation placeholder rather than real
        # addresses -- confirm and restore the intended addresses.
        gr.Markdown("""
### Project
This stylometry pipeline was developed by [CLiPS](https://www.uantwerpen.be/en/research-groups/clips/) ([University of Antwerp](https://www.uantwerpen.be/en/)) during the [CLARIAH-VL](https://clariahvl.hypotheses.org/) project.
### Contact
If you have questions, please send them to [Jens Lemmens](mailto:[email protected]) or [Walter Daelemans](mailto:[email protected])
""")
        with gr.Row():
            gr.Markdown("""<center><img src="https://platformdh.uantwerpen.be/wp-content/uploads/2019/03/clariah_def.png" alt="Image" width="200"/></center>""")
            gr.Markdown("""<center><img src="https://thomasmore.be/sites/default/files/2022-11/UA-hor-1-nl-rgb.jpg" alt="Image" width="175"/></center>""")
# Enable the request queue with a default concurrency limit of 10 per event,
# then start the server on all interfaces (port 7860) with a share link.
demo.queue(default_concurrency_limit=10)
demo.launch(
    server_name='0.0.0.0',
    server_port=7860,
    share=True,
)