5C_compose_text.py
import json
from tqdm import tqdm
from pprint import pprint
from random import randrange, randint
from fuzzywuzzy import fuzz
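
# Compose a block of plain text from the most relevant words collected in
# result/relevant_words.json (presumably written by an earlier step of this
# pipeline; the producing script is not shown here). Words are scored,
# deduplicated, lightly shuffled to break up repetitions, and written to
# result/relevant_words.txt.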

# Custom shuffle function to mix up the beginning of the text with its more frequently repeating words
def fuzzy_knuth_shuffle(l):
    end = len(l) - 1
    i = 0
    while i < end:
        # Less shuffling is necessary for the less relevant (less frequent) words towards the end
        fuzziness = max(1, int(10 - 10 * ((i / end) ** 2)))
        if any(letter in l[i] for letter in ['.', ',', ';', '_', '-']):
            fuzziness += 10
        stop = min(i + fuzziness, end)
        idx = randrange(i, stop)
        l[i], l[idx] = l[idx], l[i]
        i += 1
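
# Illustrative sketch of the shuffle behaviour (not part of the pipeline):
# for a list of more than ten words, the first word can end up anywhere among
# the first ten positions, while words near the end of the list barely move,
# so the text stays roughly ordered by relevance. The exact result depends on
# the random state, e.g.:
#   sample = ["alpha", "beta", "gamma", "delta", "epsilon"]
#   fuzzy_knuth_shuffle(sample)
#   sample  # -> e.g. ['gamma', 'alpha', 'beta', 'epsilon', 'delta']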

word_cache = list()
word_scores = list()
with open("result/relevant_words.json", "r") as inputFile:
    collected_words = json.load(inputFile)
for [key, [score, words]] in tqdm(collected_words.items()):
    for word in words:
        # Score a word by summing the scores of all keys it contains, normalised by word length
        word_score = 0
        for [other_key, [other_score, _]] in collected_words.items():
            if other_key in word:
                word_score += int(other_score)
        word_score = int(word_score / len(word))
        # Keep only long, high-scoring words that are not (fuzzy) duplicates of already cached ones
        if word not in word_cache and word_score > 950:
            if len(word) > 3 and not any(fuzz.partial_ratio(word, w) == 100 for w in word_cache):
                word_cache.append(word)
                word_scores.append((word_score, word))
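
# The loops above assume result/relevant_words.json maps each key to a
# [score, words] pair, roughly: {"network": [980, ["networking", "networks"]], ...}
# (illustrative values, not taken from real data). Under that assumption a word
# like "networking" (length 10) containing only the key "network" (score 980)
# gets word_score = int(980 / 10) = 98, so only words whose contained key
# scores add up to more than 950 per character pass the filter.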

word_scores = sorted(word_scores, reverse=True)
selected_words = dict()
selected_words['lower'] = list()
selected_words['upper'] = list()
for word_score, word in word_scores[:500]:
    case = 'upper' if word.isupper() else 'lower'
    selected_words[case].append(word)

# Loosen up the list of words to avoid obvious repetitions
fuzzy_knuth_shuffle(selected_words['lower'])
fuzzy_knuth_shuffle(selected_words['upper'])

with open("result/relevant_words.json", "r") as inputFile:
    for [key, [score, words]] in json.load(inputFile).items():
        share = int(max(min(score / 1000, 10), 1))  # scale the score to a share between 1 and 10
        case = 'upper' if key.isupper() else 'lower'
        # Reduce the share by the number of already selected words that contain this key
        for word in selected_words[case]:
            if key in word:
                share -= 1
        share = max(0, min(len(words), share))  # make sure the share is not larger than the number of words
        for word in words[:share]:
            if word not in selected_words[case] and not any(letter in word for letter in ['{', '}', '[', ']', '\\', '_', '<', '>']):
                # Mix the more relevant words into the existing text (made of the most relevant words)
                if score > 700:
                    selected_words[case].insert(randint(0, len(selected_words[case])), word)
                else:
                    selected_words[case].append(word)
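
# Worked example of the share logic above (illustrative numbers): a key with
# score 3500 gets share = int(max(min(3.5, 10), 1)) = 3; if one already
# selected word contains that key, the share drops to 2, so at most the first
# two of its words are added in this pass.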

text = ''
for case, words in selected_words.items():
    # knuth_shuffle(words)
    text += ' '.join(words) + '\n\n'
    print(case, len(words))

with open("result/relevant_words.txt", "w") as output:
    output.write(text)