import csv
import pickle
import os
import numpy as np
from warnings import warn
from features import (get_global_set, feature_list_and_dict, vectorise,
                      document_frequency, Vectoriser)


def save_pkl_txt(name_freq, filename, directory='../data'):
    """
    Save a list of names and frequencies, in both .pkl and .txt format
    :param name_freq: list of (name, frequency) pairs
    :param filename: name of output files (without file extension)
    :param directory: directory of data files (default ../data)
    """
    with open(os.path.join(directory, filename + '.pkl'), 'wb') as f:
        pickle.dump(name_freq, f)
    with open(os.path.join(directory, filename + '.txt'), 'w') as f:
        for name, freq in name_freq:
            f.write('{}\t{}\n'.format(name, freq))
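
# A minimal usage sketch (hypothetical data), showing the two files produced:
#     save_pkl_txt([('biyo', 12), ('caano', 5)], 'example_features')
#     # -> ../data/example_features.pkl  (pickled list of pairs)
#     # -> ../data/example_features.txt  (tab-separated, one pair per line)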


def save(msgs, code_vecs, code_names, output_file, extractor=None,
         vectoriser=None, directory='../data'):
    """
    Save features and codes to file
    :param msgs: list of strings (supervised learning input)
    :param code_vecs: boolean numpy matrix of codes (supervised learning output)
        - rows correspond to elements in msgs
        - columns correspond to elements in code_names
    :param code_names: list of names of codes
    :param output_file: name of output file (without .pkl file extension)
        - as well as saving to example.pkl, also saves to:
        - example_codes.pkl (list of names of codes, with frequencies)
        - example_codes.txt (as above, but human-readable)
        - and if a feature extractor is given rather than a vectoriser, also saves to:
        - example_features.pkl (list of names of features, with frequencies)
        - example_features.txt (as above, but human-readable)
    :param extractor: function mapping strings to bags of features
    :param vectoriser: function mapping lists of strings to numpy arrays
    :param directory: directory of data files (default ../data)
    """
    # Check that input dimensions match
    N = len(msgs)
    K = len(code_names)
    if code_vecs.shape != (N, K):
        raise ValueError('Dimensions do not match')
    # Convert the messages to feature vectors
    if vectoriser:
        feat_vecs = vectoriser(msgs)
    else:
        # With only a feature extractor, we must first define indices of features
        # Extract features
        feat_bags = [extractor(m) for m in msgs]
        # Get the global set of features
        feat_set = get_global_set(feat_bags)
        feat_list, feat_dict = feature_list_and_dict(feat_set)
        # Convert messages to vectors
        feat_vecs = vectorise(feat_bags, feat_dict)
        # Find the document frequency of each feature and save features to file
        feat_freq = (feat_vecs != 0).sum(0)
        # Convert from Numpy to Python data types
        feats = list(zip(feat_list, [int(x) for x in feat_freq]))
        save_pkl_txt(feats, output_file + '_features', directory)
    # Find the frequency of each code and save codes to file,
    # as a list of names and frequencies
    code_freq = code_vecs.sum(0)
    # Convert from Numpy to Python data types
    codes = list(zip(code_names, [int(x) for x in code_freq]))
    save_pkl_txt(codes, output_file + '_codes', directory)
    print('Codes:')
    print(*codes, sep='\n')
    # Save the input and output matrices
    with open(os.path.join(directory, output_file + '.pkl'), 'wb') as f:
        pickle.dump((feat_vecs, code_vecs), f)
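
# A minimal sketch of a direct call (hypothetical messages, codes, and
# extractor):
#     msgs = ['biyo ma jiraan', 'waan xanuunsanayaa']
#     code_vecs = np.array([[True, False], [False, True]])
#     save(msgs, code_vecs, ['Water', 'Health'], 'example', extractor=bag_of_words)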


def preprocess_long(input_file, output_file, extractor=None, vectoriser=None,
                    directory='../data', text_col=2, ignore_cols=(),
                    convert=bool):
    """
    Preprocess a csv file to feature vectors and binary codes,
    where the input data has a 0 or 1 for each code and message
    :param input_file: input file name (without .csv file extension)
    :param output_file: output file name (without .pkl file extension)
    :param extractor: function mapping strings to bags of features
    :param vectoriser: function mapping lists of strings to numpy arrays
    :param directory: directory of data files (default ../data)
    :param text_col: index of column containing text
    :param ignore_cols: indices of columns to ignore
    :param convert: function to convert code strings (e.g. bool or int;
        note that bool('0') is True, so use int if codes are written '0'/'1')
    """
    if extractor is None and vectoriser is None:
        raise TypeError('Either extractor or vectoriser must be given')
    if extractor and vectoriser:
        raise TypeError('Only one of extractor and vectoriser should be given')
    # Extract features and codes
    # We can vectorise the codes immediately, but for features, we first need a global list
    msgs = []
    code_vecs = []
    with open(os.path.join(directory, input_file + '.csv'), newline='') as f:
        # Process the file as a CSV file
        reader = csv.reader(f)
        # Find the headings (the first row of the file)
        headings = next(reader)
        # Restrict ourselves to a subset of columns (not containing text, and not ignored)
        code_cols = sorted(set(range(len(headings))) - {text_col} - set(ignore_cols))
        code_names = [headings[i] for i in code_cols]
        # Iterate through data
        for row in reader:
            # Get the message text, and the vector of codes
            msgs.append(row[text_col])
            code_vecs.append(np.array([convert(row[i]) for i in code_cols], dtype='bool'))
    # Convert the list of code vectors to a matrix
    code_vecs = np.array(code_vecs)
    # Save the information
    save(msgs, code_vecs, code_names, output_file, extractor, vectoriser, directory)
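
# A sketch of the expected "long" csv layout (hypothetical columns and values):
# one 0/1 column per code, with the message text in text_col.
#     id,date,message,Water,Health
#     1,...,biyo ma jiraan,1,0
#     2,...,waan xanuunsanayaa,0,1
# e.g. preprocess_long('example_long', 'example', bag_of_words,
#                      ignore_cols=[0, 1], text_col=2, convert=int)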


def preprocess_pairs(input_file, output_file, extractor=None, vectoriser=None,
                     directory='../data', text_col=0, ignore_cols=(),
                     uncoded=('', 'NM'), triples=False):
    """
    Preprocess a csv file to feature vectors and binary codes,
    where the input data has groups of codes,
    and each message has up to two codes from each group.
    Each group must take up exactly two columns (or three, when triples=True).
    :param input_file: input file name (without .csv file extension)
    :param output_file: output file name (without .pkl file extension)
    :param extractor: function mapping strings to bags of features
    :param vectoriser: function mapping lists of strings to numpy arrays
    :param directory: directory of data files (default ../data)
    :param text_col: index of column containing text
    :param ignore_cols: indices of columns to ignore
    :param uncoded: strings to be interpreted as lacking a code
    :param triples: if True, use groups of three columns instead of two
        (column indices are hard-coded for the HIV/AIDS data)
    """
    if extractor is None and vectoriser is None:
        raise TypeError('Either extractor or vectoriser must be given')
    if extractor and vectoriser:
        raise TypeError('Only one of extractor and vectoriser should be given')
    # Extract features and codes
    # We cannot vectorise these until we have a global list
    msgs = []
    code_sets = []
    with open(os.path.join(directory, input_file + '.csv'), newline='') as f:
        # Process the file as a CSV file
        reader = csv.reader(f)
        # Find the headings (the first row of the file)
        headings = next(reader)
        # Restrict ourselves to a subset of columns (not containing text, and not ignored)
        code_cols = sorted(set(range(len(headings))) - {text_col} - set(ignore_cols))
        # Group columns in pairs
        pair_indices = list(zip(code_cols[::2], code_cols[1::2]))
        pair_names = [headings[i][:-1].strip() for i in code_cols[::2]]
        if triples:
            # 3 reasons in HIV/AIDS data
            pair_indices = [(4, 5, 6), (7, 8, 9)]
            pair_names = [headings[i][:-1].strip() for i in code_cols[::3]]
        print('names: ', pair_names)
        # Find features and codes
        for row in reader:
            # Find words in message
            msgs.append(row[text_col])
            # Find codes
            row_code_set = set()
            for name, inds in zip(pair_names, pair_indices):
                # The code is recorded as a tuple (pair_name, value)
                # If a code is repeated, it is only counted once
                row_code_set |= {(name, row[i])
                                 for i in inds if row[i] not in uncoded}
            code_sets.append(row_code_set)
    # Get the global set of codes, and convert to vectors
    codes = get_global_set(code_sets)
    K = len(codes)
    code_list, code_dict = feature_list_and_dict(codes)

    def vectorise_codes(set_of_codes):
        """
        Convert names of codes to a vector
        :param set_of_codes: set of code names
        :return: numpy array
        """
        vec = np.zeros(K, dtype='bool')
        for c in set_of_codes:
            vec[code_dict[c]] = True
        return vec

    code_vecs = np.array([vectorise_codes(x) for x in code_sets])
    # Save the information
    save(msgs, code_vecs, code_list, output_file, extractor, vectoriser,
         directory)
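
# A sketch of the expected "pairs" csv layout (hypothetical values): each code
# group spans two columns, e.g. headed "Reason 1" and "Reason 2"; the group
# name is taken from the first heading with its last character stripped:
#     message,Reason 1,Reason 2
#     biyo ma jiraan,ceeb,NM
# This row would yield the single code ('Reason', 'ceeb'), since 'NM' is
# treated as uncoded.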


def preprocess_keywords(keyword_file, feature_file, output_file=None,
                        directory='../data'):
    """
    Preprocess the keywords, converting words to feature indices
    The input file should have one line per code, with keywords separated
    by commas.
    Each keyword should be either a single word or a bigram.
    For readability, the line can begin with the name of the code, separated
    by a tab.
    e.g.: (note the tab character)
    "Emotional Causes	walwal, isla hadal"
    Note that the order of the codes should match the order used in the above
    functions.
    The output file will be a pickled list of lists of feature indices
    :param keyword_file: input file name (without .txt file extension)
    :param feature_file: file containing the global list of features
        (without .pkl file extension)
    :param output_file: output file name (without .pkl file extension)
        (default is the same as the keyword file, with a different file extension)
    :param directory: directory of data files (default ../data)
    """
    # Set output file name, if not given
    if output_file is None:
        output_file = keyword_file
    # Load features
    with open(os.path.join(directory, feature_file + '.pkl'), 'rb') as f:
        feats = pickle.load(f)
    # Ignore frequency information
    feat_list = [x for x, _ in feats]
    # Convert to a dict
    feat_dict = {x: i for i, x in enumerate(feat_list)}
    # Read keyword file
    with open(os.path.join(directory, keyword_file + '.txt')) as f:
        full_list = []
        for line in f:
            # Get the keywords
            parts = line.split('\t')
            keywords = [x.split() for x in parts[-1].split(',')]
            indices = []
            for k in keywords:
                # Look up each keyword either as a single word, or as a bigram
                try:
                    if len(k) == 1:
                        indices.append(feat_dict['word', k[0]])
                    elif len(k) == 2:
                        indices.append(feat_dict['ngram', tuple(k)])
                    else:
                        print(line, k)
                        raise ValueError('Keywords must be one or two words '
                                         'long')
                except KeyError:
                    warn("Keyword '{}' could not be found as a feature"
                         .format(' '.join(k)))
            # Add to the full list
            full_list.append(indices)
    # Save the keyword indices to file
    with open(os.path.join(directory, output_file + '.pkl'), 'wb') as f:
        pickle.dump(full_list, f)
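
# A sketch of a keyword file (hypothetical content; tab after the code name):
#     Emotional Causes	walwal, isla hadal
# Here 'walwal' is looked up as the feature ('word', 'walwal'), and
# 'isla hadal' as the bigram feature ('ngram', ('isla', 'hadal')).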


def iter_bags_of_features(input_files, extractor, directory='../data',
                          text_col=0):
    """
    Extract a bag of features from each message in one or more csv files
    :param input_files: single filename, or list of filenames (without .csv
        file extension)
    :param extractor: function mapping strings to bags of features
    :param directory: directory of data files (default ../data)
    :param text_col: index of column containing text (default 0)
    :return: iterator yielding bags of features, one per message
    """
    # If only one file is given, convert to a list
    if isinstance(input_files, str):
        input_files = [input_files]
    # Iterate through files
    for filename in input_files:
        with open(os.path.join(directory, filename + '.csv'), newline='') as f:
            # Process the file as a CSV file
            reader = csv.reader(f)
            # Ignore headings
            next(reader)
            # Iterate through messages
            for row in reader:
                yield extractor(row[text_col])
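
# A minimal usage sketch (hypothetical filenames):
#     bags = iter_bags_of_features(['malaria_original', 'wash_original'], bag_of_words)
#     first_bag = next(bags)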


def extract_features_and_idf(input_files, output_file, extractor,
                             threshold=None, directory='../data', text_col=0):
    """
    Extract features from all messages, and filter by document frequency
    Creates a Vectoriser that can convert messages to feature vectors weighted
    by idf
    :param input_files: single filename, or list of filenames (without .csv
        file extension)
    :param output_file: name of output file (without .pkl file extension)
        - as well as saving to example.pkl, also saves to:
        - example_features.pkl (list of names of features, with frequencies)
        - example_features.txt (as above, but human-readable)
    :param extractor: function mapping strings to bags of features
    :param threshold: minimum document frequency to keep a feature
    :param directory: directory of data files (default ../data)
    :param text_col: index of column containing text (default 0)
    """
    # Get iterator over bags of features
    bags = iter_bags_of_features(input_files, extractor, directory, text_col)
    # Get document frequency
    freq = document_frequency(bags)
    # Filter out rare features
    if threshold is not None:
        freq = {feat: n for feat, n in freq.items() if n >= threshold}
    # Assign indices to features
    feat_list, feat_dict = feature_list_and_dict(freq.keys())
    # Get idf array (here, simply the reciprocal of the document frequency)
    idf = np.empty(len(feat_list))
    for feat, n in freq.items():
        idf[feat_dict[feat]] = 1 / n
    # Create and save Vectoriser
    vectoriser = Vectoriser(extractor, feat_dict, idf)
    with open(os.path.join(directory, output_file + '.pkl'), 'wb') as f:
        pickle.dump(vectoriser, f)
    # Save list of features
    feat_freq = [(feat, freq[feat]) for feat in feat_list]
    save_pkl_txt(feat_freq, output_file + '_features', directory)
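
# A minimal usage sketch (hypothetical filenames), keeping only features that
# occur in at least 3 documents:
#     extract_features_and_idf(['malaria_original', 'wash_original'],
#                              'combined', bag_of_words, threshold=3)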


def both_functions(a_word):
    """
    Run preprocess_long and preprocess_keywords in simple sequence,
    as an attempt to clean up the steps in the __main__ block below.
    Relies on the module-level feature_extractor defined under __main__.
    :param a_word: base name for the output files (e.g. 'wash')
    """
    # Preprocess individual files with an extractor
    preprocess_long('wash_training_long_1005', a_word, feature_extractor,
                    ignore_cols=[0, 1], text_col=2)
    # Preprocess keywords
    codes = '{}_codes'.format(a_word)
    feat = '{}_features'.format(a_word)
    preprocess_keywords(codes, feat)
    print('It is done')


if __name__ == "__main__":
    # Extract both single words and bigrams
    from features import (bag_of_words, bag_of_ngrams,
                          bag_of_variable_character_ngrams, apply_to_parts,
                          combine)
    bag_of_words_parts = apply_to_parts(bag_of_words, '&&&')
    bag_of_ngrams_parts = apply_to_parts(bag_of_ngrams, '&&&')
    #functions = [bag_of_words_parts, bag_of_ngrams_parts]
    functions = [bag_of_words, bag_of_ngrams]
    kwargs = [{}, {'n': 2}]
    #feature_extractor = bag_of_words_parts
    feature_extractor = combine(functions, kwarg_params=kwargs)
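    # Under the assumed 'features' module, this combined extractor should
    # return bags keyed like ('word', 'biyo') and ('ngram', ('biyo', 'ma')),
    # matching the lookups in preprocess_keywords.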
    # both_functions('wash')

    ### Define feature vectors based on a whole corpus
    '''
    input_files = ['malaria_original', 'wash_original', 'nutrition_original', 'ANC_Delivery Training Set.xlsx - Short']
    extract_features_and_idf(input_files, 'four_combined', feature_extractor, 3)
    '''

    ### Preprocess individual files with an extractor
    #preprocess_long('malaria_training_long_1105', 'malaria', feature_extractor, ignore_cols=[0,1], text_col=2)
    #preprocess_long('wash_training_long_1005', 'wash', feature_extractor, ignore_cols=[0, 1], text_col=2)
    preprocess_long('wash_s04_training_long_1705', 'wash_s04', feature_extractor, ignore_cols=[0, 1], text_col=2)
    #preprocess_pairs('wash_original', 'wash', feature_extractor, ignore_cols=[1,2,13,14])
    '''
    preprocess_long('nutrition_original', 'nutrition', feature_extractor, ignore_cols=[1,13,14], convert=bool)
    preprocess_long('ANC_Delivery Training Set.xlsx - Short', 'delivery', feature_extractor, convert=int)
    '''

    ### Preprocess individual files with a vectoriser
    '''
    with open('../data/four_combined.pkl', 'rb') as f:
        vecr = pickle.load(f)
    preprocess_pairs('malaria_original', 'malaria', vectoriser=vecr, ignore_cols=[1,6])
    preprocess_pairs('wash_original', 'wash', vectoriser=vecr, ignore_cols=[1,2,13,14])
    preprocess_long('nutrition_original', 'nutrition', vectoriser=vecr, ignore_cols=[1,13,14], convert=bool)
    preprocess_long('ANC_Delivery Training Set.xlsx - Short', 'delivery', vectoriser=vecr, convert=int)
    '''

    ### Preprocess keywords
    #preprocess_keywords('malaria_keywords', 'malaria_features')
    #preprocess_keywords('wash_codes', 'wash_features')
    '''
    preprocess_keywords('nutrition_keywords', 'nutrition_features')
    preprocess_keywords('delivery_keywords', 'delivery_features')
    '''