text_utilities.py
import random
from colorama import init, Fore, Style
from log_config import configure_logger
from config import Config
# Initialize colorama
init(autoreset=True)


class TextGenerator:
    # This class generates pseudo-random text from a given corpus file using a Markov chain.

    # Initializes the class by configuring a logger with the current module's name
    # and enabling auto-reset for the terminal color output.
    def __init__(self):
        self.logger = configure_logger(__name__)
        init(autoreset=True)

    # This method returns the text content of a given corpus file as a string.
    # The corpus file is expected to be UTF-8 encoded. Combining the errors='replace'
    # parameter with the try/except block provides robust handling of the various
    # issues that may arise when reading input files.
    @staticmethod
    def return_corpus_text(corpus_file_name):
        """
        Reads the content of a corpus file and returns it as a string.

        Args:
            corpus_file_name (str): The path of the corpus file to be read.

        Returns:
            str: The content of the corpus file as a string. If an error occurs while reading the file,
            an empty string is returned and an error message is printed.

        Raises:
            None
        """
        try:
            with open(corpus_file_name, encoding='utf-8', errors='replace') as content_file:
                corpus_as_string = content_file.read()
            return corpus_as_string
        except Exception as e:
            print(f"{Fore.RED}[-] Error while reading the corpus file '{corpus_file_name}': {e}{Style.RESET_ALL}")
            return ""
    # This method converts a list of words into a single space-separated string.
    @staticmethod
    def convert_word_list_to_string(final_output_word_list):
        return ' '.join(final_output_word_list)
    def generate_text(self, corpus_file_name, prefix_length, output_words_length, seed_words=None):
        """
        Generate random text based on the input corpus file content, given prefix and word lengths.

        This function reads the corpus file, cleans up the corpus string, applies a Markov algorithm to generate
        a list of output words, and then cleans up the output.

        :param corpus_file_name: The path to the input corpus file.
        :type corpus_file_name: str
        :param prefix_length: The number of preceding words used as the prefix (key) in the transition dictionary.
        :type prefix_length: int
        :param output_words_length: The number of words to generate and append after the initial prefix words.
        :type output_words_length: int
        :param seed_words: Initial words to seed the Markov algorithm. If None, the algorithm chooses initial words.
        :type seed_words: tuple, optional
        :returns: A list of words forming the generated text.
        :rtype: list
        """
        corpus_as_string = self.return_corpus_text(corpus_file_name)
        corpus_as_string = self.clean_up_corpus_string(corpus_as_string)
        output_word_list = self.markov_algorithm(corpus_as_string, output_words_length, prefix_length, seed_words)
        # Note: clean_up_markov_output trims the word list in place, so its return value is not needed.
        self.clean_up_markov_output(output_word_list)
        return output_word_list
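
    # Illustrative data flow through generate_text (hypothetical file name; the Markov
    # output is random, so the exact words shown here are only an example):
    #     generate_text("corpus.txt", prefix_length=2, output_words_length=5)
    #     1. return_corpus_text      -> "The cat sat. The cat ran."
    #     2. clean_up_corpus_string  -> ['the', 'cat', 'sat', 'the', 'cat', 'ran']
    #     3. markov_algorithm        -> a 7-word list (2 prefix words + 5 generated),
    #                                   e.g. ['the', 'cat', 'sat', 'the', 'cat', 'ran', 'the']
    #     4. clean_up_markov_output  -> trims 'and'/'i'/'mr'/'him' from the ends in place, if present.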
    @staticmethod
    def clean_up_markov_output(output_word_list):
        # This method cleans up the output of the Markov algorithm.
        # Guard against an empty list so the index checks below cannot fail.
        if not output_word_list:
            return
        # Remove words that shouldn't be at the end of a sentence.
        if output_word_list[-1] in ["and", "i", "mr"]:
            output_word_list.pop()
        # Remove words that shouldn't be at the beginning of a sentence.
        if output_word_list[0] in ["and", "him"]:
            output_word_list.pop(0)
    def markov_algorithm(self, corpus_as_string, output_words_length, prefix_length, seed_words):
        """
        Generate a list of output words using a Markov chain algorithm.

        The function processes the input corpus (a list of words produced by clean_up_corpus_string)
        to create a dictionary of word sequences and their possible next words.
        It then uses the Markov chain algorithm to generate a sequence of words.

        :param corpus_as_string: The input corpus as a list of words (the output of clean_up_corpus_string).
        :type corpus_as_string: list
        :param output_words_length: The number of words to generate.
        :type output_words_length: int
        :param prefix_length: The length of the prefix for the Markov chain to predict the next word.
        :type prefix_length: int
        :param seed_words: Initial words to seed the Markov chain. Optional.
        :type seed_words: tuple
        :returns: A list of words forming the generated text.
        :rtype: list
        """
        # prefix_length determines the order of the Markov chain,
        # i.e. how many words to look backwards to predict the next word.
        chain = {tuple(['.'] * prefix_length): [' ']}
        # Iterate through the corpus and create a dictionary of sequences and their next words.
        for i in range(len(corpus_as_string) - prefix_length):
            # The variable seq holds a tuple of words of length prefix_length,
            # starting at index i in the corpus word list.
            seq = tuple(corpus_as_string[i: i + prefix_length])
            # Initialize an empty list as the value for a new seq key in the chain dictionary
            # if it doesn't already exist.
            if seq not in chain:
                chain[seq] = []
            # Append the word that follows this sequence, or a period if the sequence ends at the corpus boundary.
            chain[seq].append(corpus_as_string[i + prefix_length] if i + prefix_length < len(corpus_as_string) else '.')

        if seed_words is not None:
            # start_seq = tuple(seed_words.split())
            start_seq = seed_words
        else:
            start_seq = random.choice(list(chain.keys()))
        output_word_list = list(start_seq)
        current_seq = start_seq
        for i in range(output_words_length):
            try:
                next_word = random.choice(chain[current_seq])
            except KeyError:
                if Config.VERBOSE:
                    not_found_message = (
                        f"The exact seed word sequence {Fore.RED}"
                        f"'{seed_words}'"
                        f"{Style.RESET_ALL} was not found in the original corpus."
                    )
                    self.logger.warning(not_found_message)
                # Fall back to a random sequence from the chain and continue generating.
                current_seq = random.choice(list(chain.keys()))
                next_word = random.choice(chain[current_seq])
            # Slide the window forward by one word.
            current_seq = tuple((list(current_seq) + [next_word])[1:])
            output_word_list.append(next_word)
        return output_word_list
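
    # Worked illustration of the transition dictionary built above for a toy corpus
    # with prefix_length=2 (real corpora produce far larger dictionaries):
    #     corpus word list: ['the', 'cat', 'sat', 'the', 'cat', 'ran']
    #     chain = {
    #         ('.', '.'):     [' '],           # seed entry created before the loop
    #         ('the', 'cat'): ['sat', 'ran'],
    #         ('cat', 'sat'): ['the'],
    #         ('sat', 'the'): ['cat'],
    #     }
    #     Generation repeatedly picks a random successor of the current two-word window,
    #     then slides the window forward by one word.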
    @staticmethod
    def clean_up_corpus_string(corpus_as_string):
        """
        Cleans up the input corpus string by converting it to lowercase, replacing special characters, and splitting
        it into a list of words.

        This method processes the input string to convert all characters to lowercase, replace punctuation
        with spaces, remove certain characters, and split the string into a list of words.

        :param corpus_as_string: The input corpus as a single string.
        :type corpus_as_string: str
        :returns: The cleaned-up corpus as a list of words.
        :rtype: list
        """
        corpus_as_string = corpus_as_string.lower()
        # Replace sentence punctuation with spaces so it acts as a word separator.
        for char in ["!", ".", ",", "@", "&", "?", "-"]:
            corpus_as_string = corpus_as_string.replace(char, " ")
        # Remove quoting and bracketing characters entirely.
        for char in ['"', '(', ')']:
            corpus_as_string = corpus_as_string.replace(char, "")
        corpus_as_string = corpus_as_string.split()
        return corpus_as_string
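

# A minimal usage sketch: the corpus path, prefix length, and output length below are
# hypothetical values chosen for illustration; any UTF-8 text file will do.
if __name__ == "__main__":
    generator = TextGenerator()
    # Generate 50 words from 3-word prefixes and let the chain pick its own starting words.
    generated_words = generator.generate_text(
        corpus_file_name="corpus.txt",  # hypothetical path to a UTF-8 corpus file
        prefix_length=3,
        output_words_length=50,
    )
    print(generator.convert_word_list_to_string(generated_words))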