# log_reg_util_small.py

# Text utilities: tokenize messages, build a vocabulary, and turn messages
# into word-count feature arrays.

import collections
import numpy as np
import util

def get_words(message):
    """Tokenize a message into lowercase words.

    The message is cut at every single space character (" ") and each
    resulting piece is lowercased. No other whitespace (tabs, newlines) acts
    as a separator, and runs of consecutive spaces as well as leading or
    trailing spaces produce empty-string tokens.

    Args:
        message: A string containing an SMS message

    Returns:
       A list with one lowercased token per space-separated piece of the
       message, in their original order.
    """

    # *** START CODE HERE ***
    pieces = message.split(" ")
    return list(map(str.lower, pieces))
    # *** END CODE HERE ***


def create_dictionary(messages, min_frequency=5):
    """Create a dictionary mapping words to integer indices.

    Each message is tokenized with get_words, and a word is kept only if it
    occurs in at least `min_frequency` distinct messages (repeated occurrences
    inside one message count once). Rare words are often not useful for
    modeling, which is why they are dropped.

    Indices are assigned in sorted word order so that the mapping is identical
    from one interpreter run to the next.

    Args:
        messages: A list of strings containing SMS messages
        min_frequency: Minimum number of messages a word must appear in to be
            included in the dictionary. Defaults to 5.

    Returns:
        A python dict mapping words to integers 0..K-1, where K is the number
        of retained words.
    """

    # *** START CODE HERE ***
    message_counts = collections.Counter()
    for message in messages:
        # Deduplicate per message so the counter holds the number of
        # messages containing each word, not the total occurrences.
        message_counts.update(set(get_words(message)))

    # Sort before numbering: the counter's key order comes from iterating
    # sets of strings, which depends on per-process string hash randomization
    # and would otherwise make the indices differ between runs.
    common_words = sorted(
        word for word, count in message_counts.items() if count >= min_frequency
    )
    return {word: index for index, word in enumerate(common_words)}
    # *** END CODE HERE ***


def transform_text(messages, word_dictionary):
    """Build a word-count array from a list of text messages.

    Every message is tokenized with get_words. Each token found in
    `word_dictionary` increments the cell at (message row, token's column);
    tokens missing from the dictionary are skipped.

    Args:
        messages: A list of strings where each string is an SMS message.
        word_dictionary: A python dict mapping words to integers, used as
            column indices.

    Returns:
        A numpy array of shape (len(messages), len(word_dictionary)) created
        with np.zeros, whose entry (i, j) is the number of occurrences of the
        word mapped to column j in the i-th message.
    """
    # *** START CODE HERE ***
    shape = (len(messages), len(word_dictionary))
    features = np.zeros(shape)

    for row, text in enumerate(messages):
        for token in get_words(text):
            # Out-of-vocabulary tokens contribute nothing.
            if token not in word_dictionary:
                continue
            column = word_dictionary[token]
            features[row, column] += 1

    return features
    # *** END CODE HERE ***