-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaverager.py
executable file
·110 lines (87 loc) · 3.76 KB
/
averager.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/python
#
# This code calculates average frequencies across a set of years of lyrics,
# to enable words to be manually chosen for the "any year" cards.
import re
import os
import random
import copy
import math
# total_occurrence_count = word -> count across all years
total_occurrence_count = {}
common_words = ['the', 'to', 'and', 'it', 'i', 'you', 'a', 'of', 'in', 'me',
'my', 'is', 'for', 'all', 'can', 'on', 'we', 'that', 'so',
'now', 'like', 'up', 'your', 'one', 'if', 'be', 'have', 'but',
'with', 'no', 'when', 'from', 'do', 'make',
'', 'us', 'what', 'where', 'see', 'will', 'at', 'into',
'this', 'out', 'why', 'are', 'away', 'go', 'then', 'not', 'our',
'got', 'only', 'come', 'had', 'was', 'would', 'how',
'down', 'too', "it's", 'has', 'way', 'am', 'as',
'than', 'were', 'who', 'been','an', 'wont', 'dont', 'thru',
'could', 'them', 'they', 'myself', 'bu', 'ba', 'da', 'cuz',
'off', 'by', 'or', 'o', 'u', 'its', 'aah', 'till', 'yay',
'love'] # Love is included another way
def process(occurrence_names, country, text):
words = {}
wordlist = re.split(r"[-\(\)\s\.,\?:!\"]+", text)
apos = re.compile("\xe2\x80|'|\xc2\xb4");
ooohhh = re.compile(r"^[oh]+$");
for word in wordlist:
word = word.lower()
# Remove words with apostrophes
if apos.search(word):
continue
# All forms of "ooohhh"
if ooohhh.search(word):
continue
words[word.lower()] = 1
for word in words:
if not word in occurrence_names:
occurrence_names[word] = []
occurrence_names[word].append(country);
return occurrence_names
years = (2013, 2014, 2015, 1016, 2017)
for year in years:
occurrence_names = {}
for root, dirs, files in os.walk('lyrics/' + str(year)):
for filename in files:
country = filename[:-4]
file = open("lyrics/" + str(year) + "/" + filename, "r");
text = file.read()
occurrence_names = process(occurrence_names, country, text)
for word in occurrence_names:
if word in common_words:
continue
count = len(occurrence_names[word])
if not word in total_occurrence_count:
total_occurrence_count[word] = 0
total_occurrence_count[word] = total_occurrence_count[word] + count
# Make a frequency table
frequency_table = [[] for i in range(0, 25)]
for word in total_occurrence_count:
avg = int(math.floor(total_occurrence_count[word]/len(years)))
frequency_table[avg].append(word)
# Print word lists
print "Word lists:"
for i in range(0, len(files) + 1):
if len(frequency_table[i]):
print str(i) + ": " + str(frequency_table[i])
# We have five buckets, each of which will fill five squares on any card.
# We fill the buckets with words of different frequencies.
# If you want more of a challenge, add or switch in frequency_table[1],
# which is all the words which only came up once.
orig_buckets = [set(frequency_table[2]),
set(frequency_table[3]),
set(frequency_table[4]),
set(frequency_table[5]),
set(frequency_table[6] + frequency_table[7] +
frequency_table[8] +
frequency_table[9] + frequency_table[10] +
frequency_table[11] + frequency_table[12] +
frequency_table[13] + frequency_table[14])]
buckets = copy.deepcopy(orig_buckets)
print "\nBucket sizes:"
minsize = 999
for i in range(len(orig_buckets)):
print len(orig_buckets[i])
minsize = min(minsize, len(orig_buckets[i]))