-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
254 lines (219 loc) · 9.99 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
from dash import Dash, html, dcc, callback, Input, Output, State, dash_table
import plotly.express as px
import pandas as pd
from scipy import spatial
from sentence_transformers import SentenceTransformer
import umap
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, \
MaxAbsScaler, QuantileTransformer, PowerTransformer, \
Normalizer, Binarizer
# Refactor back to MVP and push to git. Then spend time on the choices.
#### Data Import
# Load the quotes dataset from a CSV file given by a relative path.
# Later code in this file reads its 'Quote' and 'Date' columns.
quotes = pd.read_csv('wit_and_wisdoms.csv')
# Option tables for the language model, reducer, and scaler.
# Make sure the import statements reflect every option listed here.

# Sentence-transformer model names; each name maps to itself.
language_model_choices = {
    model_name: model_name
    for model_name in (
        'all-MiniLM-L6-v2',
        'all-MiniLM-L12-v2',
        'all-distilroberta-v1',
        'all-mpnet-base-v2',
        'paraphrase-albert-small-v2',
        'paraphrase-MiniLM-L3-v2',
    )
}

# Dimensionality-reduction methods; each name maps to itself.
reducer_choices = {
    reducer_name: reducer_name
    for reducer_name in ('UMAP', 'PCA', 't-SNE')
}

# Scaler labels mapped to the matching class name, which is the label
# with its spaces removed (e.g. 'Max Abs Scaler' -> 'MaxAbsScaler').
scalers_choices = {
    label: label.replace(' ', '')
    for label in (
        'Standard Scaler',
        'MinMax Scaler',
        'Robust Scaler',
        'Max Abs Scaler',
        'Quantile Transformer',
        'Power Transformer',
        'Normalizer',
        'Binarizer',
    )
}
# Embed every quote with the sentence transformer.
model = SentenceTransformer('all-MiniLM-L6-v2')
paragraphs = quotes['Quote'].tolist()
embeddings = model.encode(paragraphs)

# Scale the embeddings, then reduce them with UMAP.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(list(embeddings))
reducer = umap.UMAP()
reduce_data = reducer.fit_transform(scaled_data)

# The first two reduced columns become the plot coordinates. Transposing
# yields one plain-Python list per column, assigned in a single unpacking.
quotes['Dimension_1'], quotes['Dimension_2'] = reduce_data[:, :2].T.tolist()
# Row id carried in the figure's custom data so selections map back to rows.
quotes['id'] = quotes.index
#### Dash Components
# Main Figure: one point per quote at its reduced coordinates, colored by
# 'Date'. Hovering shows the quote text and date; the raw coordinates are
# hidden from the hover box.
fig = px.scatter(
    quotes,
    x="Dimension_1",
    y="Dimension_2",
    color="Date",
    color_continuous_scale=px.colors.sequential.Turbo,
    custom_data=['id'],
    labels={'Dimension_1': '', 'Dimension_2': ''},
    hover_name='Quote',
    hover_data={'Date': True, 'Dimension_1': False, 'Dimension_2': False},
    template='simple_white',
)
# Dragging on the plot selects points.
fig.update_layout(dragmode='select')
# Hide both axes and their tick labels.
fig.update_xaxes(visible=False, showticklabels=False)
fig.update_yaxes(visible=False, showticklabels=False)
# Graph component that displays the scatter figure of embedded quotes.
# Four mode-bar buttons and the Plotly logo are removed from the toolbar.
graph_comp = dcc.Graph(
    id='scatter',
    figure=fig,
    config=dict(
        modeBarButtonsToRemove=['pan', 'autoScale', 'zoomIn', 'zoomOut'],
        displaylogo=False,
    ),
)
# Table showing selected data.
# Only the 'Quote' and 'Date' columns are displayed, and the table starts out
# filled with every quote. Sorting is handled natively by the table.
# There is deliberately no trailing comma after the closing parenthesis: a
# trailing comma would bind `table_comp` to a one-element tuple instead of
# the DataTable component itself.
table_comp = dash_table.DataTable(
    id='table-comp',
    columns=[{"name": i, "id": i} for i in quotes.columns if i in ['Quote', 'Date']],
    data=quotes.to_dict('records'),
    sort_action='native',
    style_cell={'textAlign': 'left'},
)
#### Dash App
app = Dash(__name__)

# Page copy lives in named constants so the layout below reads as a plain
# list of components.
_INTRO_MARKDOWN = '''
From 1732 to 1758, Ben Franklin published "Poor Richard's Almanack," an influential publication \
filled with farming information, calendars, poems, and proverbs offering insights into human nature and morality.\
I was gifted a condensed guide with these proverbs grouped togeather by themes such as "Deceit and Trust", "Folly", and "Talking and Silence".\
I thought it would be interesting to group these sayings using vector embeddings and dimension reduction to generate visualizations of the groupings.
I was inspired and guided by this [project from ansonyuu.](https://github.com/ansonyuu/matchmaking) \
Using the sentence-transformers library, the text is transformed into a large array. UMAP is then used to reduce the dimensions of the array to two.\
I found this [guide by StatQuest](https://youtu.be/eN0wFzBA4Sc) to be a useful introduction to the process.\
The result is an x and y value for each quote that represents its meaning and can be ploted. The clustering of \
the points should represent the similarity of the quotes. Color is added to represent the publishing date of the quote.
Hover over the points to see the quote and date it was published. Click and drag to select points and view them in the table below.\
Double click to reset the view. Try entering a quote of your own to find the most similar quote in the dataset.
'''

_SIMILARITY_MARKDOWN = '''
## Quote Similarity
Enter a quote of your own to find the most similar quote in the dataset.'''

# Page structure, top to bottom: headings, introduction, the quote-similarity
# input with its result line, the scatter plot, and the table.
app.layout = html.Div([
    html.H1('Wit, Wisdom, & Weights'),
    html.H3("A Visual Exploration of Poor Richard's Almanack"),
    html.H5('Wit: James Welch | Wisdom: Benjamin Franklin'),
    dcc.Markdown(_INTRO_MARKDOWN),
    html.Br(),
    # Disabled controls for choosing the model, reducer and scaler:
    # html.Div("Select a language model to use:"),
    # dcc.Dropdown(id='language_model',
    #              options=language_model_choices,
    #              value='all-MiniLM-L6-v2',
    #              style={'width': '50%'}),
    # html.Br(),
    # html.Div("Select a dimensionality reduction method to use:"),
    # dcc.Dropdown(id='reducer',
    #              options=reducer_choices,
    #              value='UMAP',
    #              style={'width': '50%'}),
    # html.Br(),
    # html.Div("Select a Scaling method to use:"),
    # dcc.Dropdown(id='scaler',
    #              options=scalers_choices,
    #              value='Standard Scaler',
    #              style={'width': '50%'}),
    # html.Br(),
    # html.Button(id='calc_data',
    #             n_clicks=0,
    #             children='Calculate Data'),
    # html.Br(),
    dcc.Markdown(_SIMILARITY_MARKDOWN),
    dcc.Input(id='user_quote_input', type='text', value=''),
    html.Br(),
    html.Div(id='quote-similarity'),
    html.Br(),
    graph_comp,
    html.Br(),
    html.Br(),
    html.Div(table_comp),
])
#### Helper Functions [CURRENTLY UNIMPLEMENTED]
# Could use match-case (structural pattern matching) in Python 3.10+
# Ideally the object would be the key to the dropdown, but it isn't JSON serializable
# def create_sentence_transformer(language_model):
# try:
# model = SentenceTransformer(language_model)
# except ValueError:
# ValueError('Language model not found, confirm package is installed.')
# return model
# def create_reducer(reducer):
# if reducer == 'UMAP':
# reducer = umap.UMAP()
# elif reducer == 'PCA':
# reducer = PCA()
# elif reducer == 't-SNE':
# reducer = TSNE()
# else:
# ValueError('Reducer not found, confirm package is installed.')
# return reducer
# def create_scaler(scaler):
# if scaler == 'StandardScaler':
# scaler = StandardScaler()
# elif scaler == 'MinMaxScaler':
# scaler = MinMaxScaler()
# elif scaler == 'RobustScaler':
# scaler = RobustScaler()
# elif scaler == 'MaxAbsScaler':
# scaler = MaxAbsScaler()
# elif scaler == 'QuantileTransformer':
# scaler = QuantileTransformer()
# elif scaler == 'PowerTransformer':
# scaler = PowerTransformer()
# elif scaler == 'Normalizer':
# scaler = Normalizer()
# elif scaler == 'Binarizer':
# scaler = Binarizer()
# else:
# ValueError('Scaler not found, confirm package is installed.')
# return scaler
#### Callbacks [CURRENTLY UNIMPLEMENTED]
# @app.callback(Output('scatter', 'figure'),
# Input('calc_data', 'n_clicks'),
# State('language_model','value'),
# State('reducer','value'),
# State('scaler','value'))
# def recalculate_data(n_clicks, language_model, reducer, scaler):
# print(n_clicks, language_model, reducer, scaler)
# model = create_sentence_transformer(language_model)
# reducer = create_reducer(reducer)
# scaler = create_scaler(scaler)
# # construct the sentence transformer
# paragraphs = quotes['Quote'].tolist()
# embeddings = model.encode(paragraphs)
# # Scale and Reduce the data
# scaled_data = scaler.fit_transform(list(embeddings))
# reduce_data = reducer.fit_transform(scaled_data)
# quotes['Dimension_1'] = reduce_data[:, 0].tolist()
# quotes['Dimension_2'] = reduce_data[:, 1].tolist()
# quotes['id'] = quotes.index
# fig = px.scatter(
# quotes,
# x="Dimension_1",
# y="Dimension_2",
# color="Date",
# custom_data=['id'],
# labels={'Dimension_1': '', 'Dimension_2': ''},
# hover_name='Quote',
# hover_data={'Date': True, 'Dimension_1': False, 'Dimension_2': False}
# ).update_layout(dragmode='select')
# return fig
# Need access to the model outside that first callback; run on initialization?
@app.callback(Output('quote-similarity', 'children'), Input('user_quote_input', 'value'))
def update_similarity(user_quote_input):
    """Return a message naming the dataset quote most similar to the user's text.

    The text is encoded with the module-level sentence-transformer ``model``
    and compared, by cosine similarity, with the raw quote ``embeddings``
    produced by that same model. The comparison deliberately does not use
    ``scaled_data``: those vectors have been standardized, while the query
    vector has not, so the two would not share a coordinate space.

    Args:
        user_quote_input: Current value of the ``user_quote_input`` text box.

    Returns:
        A sentence naming the most similar quote, or a single space when the
        input is empty or contains only whitespace.
    """
    # Nothing meaningful typed yet: keep the output area blank.
    if not user_quote_input or not user_quote_input.strip():
        return ' '

    quote_embedding = model.encode(user_quote_input)
    # cdist needs 2-D inputs, so the single query vector is wrapped in a list;
    # row 0 of the result holds the distance to every quote.
    similarities = 1 - spatial.distance.cdist([quote_embedding], embeddings, "cosine")[0]
    most_similar = quotes.iloc[similarities.argmax()]
    return f'The most similar quote is: {most_similar["Quote"]}'
# This callback also runs when nothing is selected (``selectedData`` is None,
# e.g. when the page first loads). Returning an empty list in that case
# wiped the table that was initialised with every quote, so the
# "no selection" case returns all rows instead.
@app.callback(Output('table-comp', 'data'), Input('scatter', 'selectedData'))
def update_table(selectedData):
    """Return the table rows matching the points selected in the scatter plot.

    Args:
        selectedData: Selection payload from the ``scatter`` graph. It is
            ``None`` when nothing is selected; otherwise its ``'points'``
            list carries each point's ``customdata``, whose first entry is
            the quote's ``id``.

    Returns:
        A list of row dicts: the selected quotes, or every quote when there
        is no selection or the selection contains no points.
    """
    points = selectedData.get('points') if selectedData else None
    if not points:
        return quotes.to_dict('records')

    highlighted_id = [point['customdata'][0] for point in points]
    return quotes[quotes['id'].isin(highlighted_id)].to_dict('records')
# Start the Dash server in debug mode, but only when this file is executed
# directly rather than imported.
# NOTE(review): newer Dash releases provide `app.run()` as the replacement for
# `run_server()` — confirm against the installed Dash version before switching.
if __name__ == '__main__':
    app.run_server(debug=True)