-
Notifications
You must be signed in to change notification settings - Fork 11
/
top_ten.py
102 lines (71 loc) · 2.8 KB
/
top_ten.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# top_ten.py
# Author: Tom Ravenscroft
import sys
import json
import re
import operator
from operator import itemgetter
# -------------------------------------------------------------------------------
# ingest_scores()
# -------------------------------------------------------------------------------
def ingest_scores(fn):
    """Load a tab-delimited sentiment file into a dictionary.

    Each non-blank line of the file must have the form ``<term>\\t<score>``,
    where ``<score>`` is an integer.

    Args:
        fn: Path of the sentiment file to read.

    Returns:
        A dict mapping each term to its integer score.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            tab-separated fields, or its score is not an integer.
    """
    scores = {}
    # The context manager closes the file even if a line fails to parse.
    with open(fn) as sentiment_file:
        for line in sentiment_file:
            # Blank lines carry no term/score pair; skip them rather than
            # failing on the two-value unpack below.
            if not line.strip():
                continue
            # The file is tab-delimited: "<term>\t<score>".
            term, score = line.split("\t")
            # int() tolerates the trailing newline left on the score field.
            scores[term] = int(score)
    return scores
# -------------------------------------------------------------------------------
# ingest_tweets()
# -------------------------------------------------------------------------------
# Build a list of tweets, each parsed from one JSON line of the input file.
def ingest_tweets(fn):
    """Read a file of JSON records, one per line, into a list.

    Args:
        fn: Path of the file to read. Every non-blank line must be a
            complete JSON document.

    Returns:
        A list with the parsed JSON value of each non-blank line, in file
        order.

    Raises:
        ValueError: If a non-blank line is not valid JSON.
    """
    tweets = []
    # The context manager closes the file even if a line fails to parse.
    with open(fn) as tweets_file:
        for line in tweets_file:
            # json.loads() rejects empty input, so skip blank lines instead
            # of aborting the whole run on them.
            if not line.strip():
                continue
            tweets.append(json.loads(line))
    return tweets
# -------------------------------------------------------------------------------
# extract_htags()
# -------------------------------------------------------------------------------
def extract_htags(tweets):
    """Collect the text of every hashtag found in a sequence of tweets.

    Args:
        tweets: Iterable of dicts. Hashtags are read from
            ``tweet["entities"]["hashtags"]``, each of which must be a dict
            with a ``"text"`` entry.

    Returns:
        A list of hashtag texts encoded as UTF-8 byte strings, in the order
        encountered. Duplicates are kept so that callers can count them.
    """
    htags = []
    for tweet in tweets:
        # Records with no "entities" field, or a null one, contribute no tags.
        entities = tweet.get("entities") or {}
        for htag in entities.get("hashtags") or []:
            # Encode to UTF-8 bytes, as the original did, so non-ASCII tags
            # can be concatenated and printed on Python 2.
            htags.append(htag["text"].encode('utf-8'))
    return htags
# -------------------------------------------------------------------------------
# top_ten()
# -------------------------------------------------------------------------------
def top_ten(htags, limit=10):
    """Print the most frequent hashtags, most frequent first.

    One line is printed per hashtag in the form ``<tag> <count>``, with the
    count rendered as a float (e.g. ``python 3.0``). Tags with equal counts
    are printed in the order in which they first appear in ``htags``.

    Args:
        htags: List of hashtag strings, with one entry per occurrence.
        limit: Maximum number of hashtags to print. Defaults to 10. Fewer
            lines are printed when there are fewer distinct hashtags.

    Returns:
        None. The ranking is written to standard output.
    """
    # Count in a single pass. `first_seen` records encounter order so that
    # the stable sort below keeps tied tags in first-appearance order.
    counts = {}
    first_seen = []
    for htag in htags:
        if htag in counts:
            counts[htag] += 1
        else:
            counts[htag] = 1
            first_seen.append(htag)

    # sorted() stays stable with reverse=True, so ties keep their order.
    ranked = sorted(first_seen, key=counts.get, reverse=True)

    # Slicing, unlike indexing 0..9, copes with fewer than `limit` tags.
    for htag in ranked[:limit]:
        # A single parenthesised expression prints identically on
        # Python 2 and Python 3.
        print(htag + " " + str(float(counts[htag])))
# -------------------------------------------------------------------------------
# main()
# -------------------------------------------------------------------------------
def main():
    """Print the most frequent hashtags in the tweet file named on the command line.

    Reads the file path from ``sys.argv[1]``, parses the tweets, extracts
    their hashtags and prints the ranking via ``top_ten()``.

    Raises:
        SystemExit: With a usage message if no tweet file was given.
    """
    # Fail with a readable message instead of a bare IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit("usage: top_ten.py <tweet_file>")

    # Populate the list of tweets.
    tweets = ingest_tweets(sys.argv[1])
    # Extract hash tags.
    htags = extract_htags(tweets)
    top_ten(htags)


if __name__ == '__main__':
    main()