-
Notifications
You must be signed in to change notification settings - Fork 1
/
frequency.py
39 lines (27 loc) · 1014 Bytes
/
frequency.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import sys
import json
def wordCount(tweet_file):
    """Print the relative frequency of every term found in a tweet stream.

    Each non-blank line of ``tweet_file`` is parsed as one JSON object.
    Objects that carry a ``"text"`` key contribute their lower-cased,
    whitespace-separated terms to the tally; objects without that key are
    ignored.  After the whole input has been consumed the totals are
    handed to ``printFrequency``, which writes the report.

    Args:
        tweet_file: Iterable of strings (for example an open text file)
            holding one JSON document per line.

    Raises:
        ValueError: If a non-blank line is not valid JSON.
    """
    total_terms = 0   # number of terms seen across all tweets
    term_counts = {}  # term -> number of occurrences

    # Parse and tally line by line; there is no need to keep every decoded
    # tweet in memory just to walk over the list once afterwards.
    for line in tweet_file:
        if not line.strip():
            continue  # an empty line is not a JSON document; skip it
        tweet = json.loads(line)
        if "text" not in tweet:
            continue  # no tweet body to tokenize

        # split() with no argument splits on any run of whitespace, so
        # repeated spaces do not yield empty "terms" and newlines/tabs
        # inside a tweet do not stay glued to a term.
        terms = tweet["text"].lower().split()
        total_terms += len(terms)
        for term in terms:
            term_counts[term] = term_counts.get(term, 0) + 1

    printFrequency(total_terms, term_counts)
def printFrequency(totalWord, wordCount):
    """Write one ``<term> <frequency>`` line per distinct term to stdout.

    The frequency of a term is its occurrence count divided by
    ``totalWord``.

    Args:
        totalWord: Total number of terms counted; the denominator of
            every frequency.
        wordCount: Mapping of term -> occurrence count.

    Raises:
        ZeroDivisionError: If ``totalWord`` is 0 while ``wordCount`` is
            not empty.
    """
    # Convert once, outside the loop; a float denominator also forces
    # true division on Python 2.
    total = float(totalWord)
    for word, count in wordCount.items():
        # sys.stdout.write is used instead of the print statement so the
        # function is valid on both Python 2 and Python 3.  strip() removes
        # whitespace on both sides so a stray trailing newline in a term
        # cannot break the one-record-per-line output.
        sys.stdout.write("%s %s\n" % (word.strip(), count / total))
def main():
    """Run the term-frequency report on the file named by the first CLI argument.

    Exits with a usage message on stderr when no path is supplied.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: %s <tweet_file>" % sys.argv[0])
    # The with-block guarantees the handle is closed even if parsing raises.
    with open(sys.argv[1]) as tweet_file:
        wordCount(tweet_file)
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()