# sentencegen_sol.py
import random
import string

import requests  # third-party: pip install requests (or pip3 install requests)
hp_text = requests.get("http://www.glozman.com/TextPages/Harry%20Potter%201%20-%20Sorcerer's%20Stone.txt")
# To print the whole corpus, run print(hp_text.text)
unique_words = {}
text = hp_text.text
# Remove punctuation from the text
translator = str.maketrans('', '', string.punctuation)
text = text.translate(translator)
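# For example (an illustrative aside, not part of the exercise), the
# three-argument maketrans builds a table that deletes every punctuation
# character, so the period and apostrophe vanish here:
assert "Mr. Dursley's".translate(translator) == "Mr Dursleys"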
# hp_words is a list of each word in the text
# (in order of appearance, duplicates included)
hp_words = [word.replace('"', '') for word in text.split()]
total_words_in_book = len(hp_words)
# Count how many times each word appears in the text
for word in hp_words:
    word = word.lower()
    if word not in unique_words:
        unique_words[word] = 1
    else:
        unique_words[word] += 1
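# Aside: the same counts can be built in one call with the standard
# library's collections.Counter; this assertion just confirms the two
# approaches agree.
from collections import Counter
assert Counter(word.lower() for word in hp_words) == unique_words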
###
# First find the probability of a given unigram in the corpus; P(w_i)
###
unigram_probs = {}

def get_all_unigrams():
    for word in unique_words:
        # probability = occurrences / total number of words
        unigram_probs[word] = unique_words[word] / total_words_in_book

def unigram(w1):
    return unigram_probs[w1.lower()]

get_all_unigrams()
print(unigram("Harry"))
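# Aside: a quick sanity check that unigram_probs is a proper probability
# distribution; the per-word counts sum to the total word count, so the
# probabilities should sum to 1 up to float rounding.
assert abs(sum(unigram_probs.values()) - 1.0) < 1e-6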
###
# Now find all bigrams in the corpus and order them from most popular
# to least; P(w_i | w_j)
# hint: sorting based on probability/frequency
###
bigram_counts = {}
def get_all_bigrams():
    for i in range(len(hp_words) - 1):
        key = (hp_words[i].lower(), hp_words[i + 1].lower())
        if key in bigram_counts:
            bigram_counts[key] += 1
        else:
            bigram_counts[key] = 1
get_all_bigrams()
bigram_counts_list = sorted(bigram_counts.items(), key=lambda pair: -pair[1])
# Print the 20 most frequent bigrams
print(bigram_counts_list[:20])
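# Aside: Counter (imported in the earlier aside) can produce the same
# top-20 list directly by zipping the word list against itself shifted
# by one position; shown here as an equivalent sketch.
lowered = [w.lower() for w in hp_words]
print(Counter(zip(lowered, lowered[1:])).most_common(20))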
###
# Find the probability of a specific bigram in the corpus; P(w_i | w_j)
###
# Method 1: uses the bigram_counts dictionary
def get_bigram_1(w1, w2):
    w1 = w1.lower()
    w2 = w2.lower()
    w1_and_w2 = (w1, w2)
    return bigram_counts[w1_and_w2] / unique_words[w1]
# Method 2: doesn't use bigram_counts; counts "w1 w2" as a raw substring
# of the text instead, so it only approximates method 1 (it misses pairs
# separated by newlines and can match inside longer words)
def get_bigram_2(w1, w2):
    w1 = w1.lower()
    w2 = w2.lower()
    w1_and_w2 = w1 + " " + w2
    # count stores the number of times the substring
    # w1_and_w2 appears in the text
    count = text.lower().count(w1_and_w2)
    return count / unique_words[w1]
# Check that the two methods agree (both lowercase their inputs, so the
# differing case of "Hagrid" should not matter)
print("v1: ", get_bigram_1("sobbed", "hagrid"))
print("v2: ", get_bigram_2("sobbed", "Hagrid"))
###
# Sentence prediction/generation
###
# Randomly pick the first word of the sentence from the vocabulary.
# Use a new name instead of rebinding unique_words to a list, so the
# counting dict stays intact for the bigram helpers above.
word_list = list(unique_words)
start_word = random.choice(word_list)
# Generate a sentence given a starting word and a sentence length
# Hint: define a function which chooses the next word in the sentence
# based on the weighted probabilities of bigrams
def get_sentence(word, length=20):
    for i in range(length):
        print(word, end=" ")
        # all bigrams whose first word is the current word
        second_word_options = [e for e in bigram_counts_list if e[0][0] == word]
        if not second_word_options:
            break
        word = weighted_choice(second_word_options)[1]
    print()
def weighted_choice(second_word_options):
    # Throw a dart on [0, total) and walk the cumulative counts until it
    # lands: pairs with larger counts cover a wider slice, so the next
    # word is chosen in proportion to its bigram frequency
    total = sum(weight for (word, weight) in second_word_options)
    threshold = random.uniform(0, total)
    current_weight = 0
    for (word, weight) in second_word_options:
        current_weight += weight
        if current_weight > threshold:
            return word
    # float edge case: uniform(0, total) can return total itself
    return second_word_options[-1][0]
get_sentence(start_word, 15)
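# Aside: since Python 3.6 the standard library offers random.choices,
# which performs the same weighted draw as weighted_choice in one call;
# an equivalent sketch (not part of the original exercise):
def weighted_choice_stdlib(second_word_options):
    pairs = [pair for (pair, count) in second_word_options]
    counts = [count for (pair, count) in second_word_options]
    # random.choices returns a list of k samples (k=1 by default)
    return random.choices(pairs, weights=counts)[0]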