-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtda_utils.py
366 lines (185 loc) · 9.56 KB
/
tda_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
import numpy as np
from tqdm.auto import tqdm
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK data needed below at import time: 'stopwords' is read by
# remove_stop_words and 'punkt' backs word_tokenize (no-op if already cached).
nltk.download('stopwords')
nltk.download('punkt')
# NOTE: not a doubly-stochastic matrix — rows sum to 1 but columns need not;
# that is sufficient for sampling theme-to-theme transitions.
def generate_transition_matrix(themes, variability=0.2):
    """
    Build a row-stochastic transition matrix over a list of elements.

    Inputs:
        themes: list of elements (here, theme names); only its length is used
        variability: (float) controls how concentrated the matrix is — a
            lower value puts more probability on the diagonal (staying in
            the same state), a higher value spreads it more uniformly
    Output:
        Numpy array of shape (len(themes), len(themes)) whose rows sum to 1
    """
    n_states = len(themes)
    # Start from the identity (probability 1 of staying in the same state)
    # and add uniform random noise in [0, variability) as a drift.
    drift = np.random.uniform(low=0, high=variability, size=(n_states, n_states))
    matrix = np.identity(n_states) + drift
    # Normalize every row into a probability distribution.
    return matrix / matrix.sum(axis=1, keepdims=True)
def generate_dirichlet(size, alpha):
    """
    Draw a probability vector (entries sum to 1) from a symmetric Dirichlet.

    Inputs:
        size: (int) length of the returned vector
        alpha: (float) concentration parameter — a higher alpha yields a
            more uniform distribution, a lower one a more peaked one
    Output:
        1D numpy array of `size` probabilities summing to 1
    """
    concentration = alpha * np.ones(size)
    return np.random.dirichlet(concentration, size=1)[0]
def generate_themes_documents_probabilities(data, alpha):
    """
    Build, for every theme, the distribution used to pick its documents.

    Inputs:
        data: dict mapping theme names to lists of document titles
        alpha: (float) Dirichlet concentration controlling the entropy of
            each per-theme distribution
    Output:
        List (one entry per theme, in dict order) of 1D probability arrays
    """
    return [generate_dirichlet(len(documents), alpha) for documents in data.values()]
def generate_session(data, themes_transitions, documents_probabilities, session_len):
    """
    Generate a fake user session: a random walk over themes where one
    document of the current theme is picked at every step.

    Inputs:
        data: dict mapping theme names to lists of document titles
        themes_transitions: 2D row-stochastic array; entry [i, j] is the
            probability of moving from theme i to theme j
        documents_probabilities: list of 1D arrays, one per theme, giving
            the probability of picking each of that theme's documents
        session_len: (int) number of steps in the generated sequence
    Output:
        List of `session_len` (theme, document) couples
    """
    themes = list(data.keys())
    theme_ids = np.arange(themes_transitions.shape[0])

    def _pick(theme_id):
        # Draw one document of the given theme according to its distribution.
        theme = themes[theme_id]
        document = np.random.choice(data[theme], p=documents_probabilities[theme_id])
        return (theme, document)

    # Select a starting theme uniformly at random, then follow the chain.
    theme_id = np.random.choice(theme_ids)
    session = [_pick(theme_id)]
    for _ in range(session_len - 1):
        theme_id = np.random.choice(theme_ids, p=themes_transitions[theme_id])
        session.append(_pick(theme_id))
    return session
# To generate several sessions that all follow the same behaviour profile.
def generate_sessions(data, themes_transitions, documents_probabilities, n_sessions, length_category='medium'):
    """
    Generate a list of sessions whose lengths all fall in the same category.

    Inputs:
        data: dict mapping theme names to lists of document titles
        themes_transitions: 2D row-stochastic theme transition matrix
        documents_probabilities: list of per-theme document-picking distributions
        n_sessions: (int) number of sessions to generate
        length_category: (str) length bucket — "small" (1 to 2 steps),
            "medium" (3 to 5) or "long" (5 to 19)
    Output:
        List of n_sessions sessions
    Raises:
        ValueError: if length_category is not one of the known buckets
    """
    # Candidate lengths per category; one is sampled uniformly per session.
    sizes_by_category = {
        "small": np.array([1, 2]),
        "medium": np.arange(start=3, stop=6),   # 3, 4, 5
        "long": np.arange(start=5, stop=20),    # 5 .. 19 (stop is exclusive)
    }
    # An explicit exception (not `assert`) so validation survives `python -O`.
    if length_category not in sizes_by_category:
        raise ValueError("Unknown specified length category please choose between small, medium or long")
    sequences_sizes = sizes_by_category[length_category]
    return [generate_session(data, themes_transitions, documents_probabilities,
                             np.random.choice(sequences_sizes))
            for _ in range(n_sessions)]
def generate_dataset(behaviours_specify, data):
    """
    Build a dataset of sessions covering several specified behaviours.

    Inputs:
        behaviours_specify: list of behaviour tuples
            (variability, alpha, n_sessions, length_category)
        data: dict mapping theme names to lists of document titles
    Output:
        Flat list of all the sessions generated for all behaviours
    """
    # One transition matrix and one set of document-picking distributions
    # per behaviour.
    transition_matrices = [
        generate_transition_matrix(data.keys(), variability=behaviour[0])
        for behaviour in behaviours_specify
    ]
    picking_probabilities = [
        generate_themes_documents_probabilities(data, alpha=behaviour[1])
        for behaviour in behaviours_specify
    ]
    result = []
    for behaviour, transitions, doc_probs in zip(
            tqdm(behaviours_specify), transition_matrices, picking_probabilities):
        result += generate_sessions(data, transitions, doc_probs,
                                    behaviour[2], behaviour[3])
    return result
def remove_stop_words(sentence):
    """
    Remove punctuation, the French elision "l'" and French stop words from a
    sentence, returning the remaining words lower-cased.

    Input:
        sentence: (str)
    Output:
        A list of filtered, lower-cased words
    """
    # Strip punctuation; the second alternative also drops a literal "l'"
    # (French elision) as a unit.
    cleaned_text = re.sub(r'[^\w\s]|l\'', '', sentence)
    words = word_tokenize(cleaned_text)
    # Build the stop-word set ONCE: the original rebuilt and linearly scanned
    # the stop-word list for every token (O(n*m)); a set makes each
    # membership test O(1). Lower-case each word a single time as well.
    french_stopwords = set(stopwords.words('french'))
    lowered = (word.lower() for word in words)
    return [word for word in lowered if word not in french_stopwords]
def get_document_words(document):
    """
    Build the word list representing a document: its theme name followed by
    the filtered words of its title.

    Input:
        document: (theme, info) couple where info is a dict with a "title" key
    Output:
        A list of lower-cased, stop-word-filtered words
    """
    theme, info = document
    return [theme.lower()] + remove_stop_words(info["title"])
def get_session_words(session):
    """
    Collect the set of distinct words representing a whole session.

    Input:
        session: list of (theme, document-info) couples
    Output:
        A set of filtered words appearing anywhere in the session
    """
    words = set()
    for document in session:
        words.update(get_document_words(document))
    return words
def get_session_representation(session, model):
    """
    Embed a session as a matrix of word vectors built from its visited
    themes and title words.

    Inputs:
        session: list of documents, each carrying a title and a theme
        model: word2vec keyed vectors (exposes `key_to_index` and item lookup)
    Output:
        Numpy array of shape (embedding_dim, n_words_found_in_model)
    """
    vocabulary = model.key_to_index
    # Words absent from the model's vocabulary are silently skipped.
    vectors = [model[word] for word in get_session_words(session) if word in vocabulary]
    return np.array(vectors).T
def get_svd_session_representation(session_representation):
    """
    Reduce a session representation matrix to its first left singular vector.

    Input:
        session_representation: 2D matrix produced by the
            get_session_representation method (shape (embedding_dim, n_words))
    Output:
        1D numpy array of length embedding_dim: the first left singular
        vector, i.e. the direction capturing the most variance
    """
    # np.linalg.svd returns (U, S, Vh) with the left singular vectors as the
    # COLUMNS of U, so the principal one is U[:, 0]. The previous code took
    # U[0, :] — the first ROW of U — which is not a singular vector at all.
    u = np.linalg.svd(session_representation)[0]
    return u[:, 0]
def get_sessions_representation(sessions, model):
    """
    Compute one representative vector per session: embed each session as a
    word-vector matrix, then keep its first singular vector.

    Inputs:
        sessions: list of sessions (one session is a list of picked documents)
        model: word2vec model providing the representation of each word
    Output:
        A list of 1D vectors, one per session
    """
    print("Vectorize sessions...")
    word_matrices = [get_session_representation(one_session, model)
                     for one_session in tqdm(sessions)]
    print("Done!\nCompute SVD sessions")
    reduced = [get_svd_session_representation(matrix)
               for matrix in tqdm(word_matrices)]
    print("Done!")
    return reduced