-
Notifications
You must be signed in to change notification settings - Fork 0
/
log_reg_util_small.py
81 lines (60 loc) · 2.73 KB
/
log_reg_util_small.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# Text utilities: message tokenization, vocabulary construction, and word-count featurization.
import collections
import numpy as np
import util
def get_words(message):
    """Tokenize a message into lowercase words.

    The message is cut at every single space character (so consecutive
    spaces yield empty-string tokens, and other whitespace such as tabs is
    left inside the tokens), and each resulting piece is lowercased.

    Args:
        message: A string containing an SMS message

    Returns:
        The list of normalized words from the message.
    """
    # *** START CODE HERE ***
    pieces = message.split(" ")
    return list(map(str.lower, pieces))
    # *** END CODE HERE ***
def create_dictionary(messages, min_frequency=5):
    """Create a dictionary mapping words to integer indices.

    The vocabulary is built from the provided training messages, using
    get_words to process each message. Rare words are often not useful for
    modeling, so a word is only added to the dictionary if it occurs in at
    least ``min_frequency`` messages. A word repeated inside one message is
    counted once for that message.

    Indices are consecutive integers starting at 0, assigned in the order in
    which the words first appear in ``messages``, so the same input always
    produces the same mapping.

    Args:
        messages: A list of strings containing SMS messages
        min_frequency: Minimum number of messages a word must appear in to be
            kept in the dictionary (default 5).

    Returns:
        A python dict mapping words to integers.
    """
    # *** START CODE HERE ***
    counter = collections.Counter()
    for message in messages:
        # De-duplicate with dict.fromkeys instead of set(): a set of strings
        # iterates in a hash-dependent order that changes between interpreter
        # runs, which made the word -> index assignment non-reproducible.
        # dict.fromkeys keeps first-seen order. The list() wrapper makes
        # Counter.update count the keys rather than add up mapping values.
        unique_words = list(dict.fromkeys(get_words(message)))
        counter.update(unique_words)
    common_words = [word for word, count in counter.items() if count >= min_frequency]
    return {word: index for index, word in enumerate(common_words)}
    # *** END CODE HERE ***
def transform_text(messages, word_dictionary):
    """Build a word-count matrix for a list of text messages.

    Row i of the result describes the i-th message and column j describes
    the vocabulary word whose index in ``word_dictionary`` is j. Each
    message is tokenized with get_words; tokens that are not keys of
    ``word_dictionary`` are ignored.

    Args:
        messages: A list of strings where each string is an SMS message.
        word_dictionary: A python dict mapping words to integers.

    Returns:
        A numpy array of shape (len(messages), len(word_dictionary)) where
        the component (i, j) is the number of occurrences of the j-th
        vocabulary word in the i-th message.
    """
    # *** START CODE HERE ***
    n_rows = len(messages)
    n_cols = len(word_dictionary)
    features = np.zeros((n_rows, n_cols))
    for row, text in enumerate(messages):
        # Keep only the tokens that belong to the vocabulary.
        known_tokens = (tok for tok in get_words(text) if tok in word_dictionary)
        for tok in known_tokens:
            col = word_dictionary[tok]
            features[row, col] += 1
    return features
    # *** END CODE HERE ***