-
Notifications
You must be signed in to change notification settings - Fork 1
/
extract_hashtag.py
75 lines (65 loc) · 2.6 KB
/
extract_hashtag.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import os, sys
import csv, json
import matplotlib.pyplot as plt
def get_hashtags(obj):
    """Map each hashtag name to the set of post indices that use it.

    Args:
        obj: Sequence of post records. Each record is indexed with
            ``'hashtags'`` to obtain an iterable of mappings, and each of
            those is indexed with ``'name'``.

    Returns:
        dict: ``{hashtag_name: {index, ...}}`` where every index is the
        position of a post inside ``obj``. When ``obj`` is empty or ``None``
        an error message is printed and an empty dict is returned.

    Raises:
        KeyError: If a post has no ``'hashtags'`` entry or a hashtag has no
            ``'name'`` entry.
    """
    if not obj:
        print('ERROR: Empty item, no hashtags to be extracted.')
        # Return an empty (still falsy) mapping instead of None so that a
        # caller doing ``tags.items()`` does not crash on empty input.
        return {}

    hashtags = {}
    for index, post in enumerate(obj):
        for hashtag in post['hashtags']:
            # A set de-duplicates a tag repeated within the same post.
            hashtags.setdefault(hashtag['name'], set()).add(index)
    return hashtags
def create_csv(file_name, path, d):
    """Write the hashtag table ``d`` to ``path`` as comma-separated text.

    The file starts with the header ``Name, Occurances, Positions`` followed
    by one row per entry of ``d``, in the iteration order of ``d``.

    Args:
        file_name: Unused; kept so existing callers keep working.
        path: Destination file. It is created or overwritten.
        d: Mapping of hashtag name to a pair whose first item is the
            occurrence count and whose second item is the collection of
            positions (a set of indices when called from this script).

    Returns:
        None
    """
    lines = ["Name, Occurances, Positions\n"]
    for key, value in d.items():
        positions = value[1]
        if isinstance(positions, set) and positions:
            # Set iteration order is not guaranteed; sort so the output is
            # reproducible. ';' separates items because ',' delimits columns.
            text = "{" + "; ".join(repr(p) for p in sorted(positions)) + "}"
        else:
            text = f"{positions}".replace(",", ";")
        lines.append(f"{key}, {value[0]}, {text}\n")

    # Hashtags are frequently non-ASCII; an explicit encoding avoids
    # UnicodeEncodeError / mojibake on platforms whose default is not UTF-8.
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(lines)

    print(f'The sorted hashtag occcurances list is contained in the file {path}.')
    return None
def plot_occurances(file_name, plots):
    """Rank the hashtags in a JSON file, save them as CSV and plot the top ones.

    The CSV is written to ``./<file_name without extension>_sorted_hashtags.csv``.
    If that file already exists nothing is computed or plotted; a message
    asking the user to delete it is printed instead.

    Args:
        file_name: Path of the JSON file holding the post records passed to
            ``get_hashtags``.
        plots: Number of top-ranked hashtags to show in the scatter plot.

    Returns:
        None
    """
    base = os.path.splitext(file_name)[0]
    path = f"./{base}_sorted_hashtags.csv"
    if os.path.exists(path):
        print(f'The file {path} containing hashtag occurances already exists. If you would like to generate a plot, please delete the file {path} and re-run the script.')
        return

    # JSON text is UTF-8 by specification; do not rely on the locale default.
    with open(file_name, encoding="utf-8") as f:
        obj = json.load(f)

    tags = get_hashtags(obj)
    if not tags:
        # Covers both "no posts" and "posts without hashtags". Bail out before
        # writing a header-only CSV that would block the next run.
        print(f'No hashtags were found in {file_name}; nothing to write or plot.')
        return
    l = len(obj)

    counted = {key: (len(value), value) for key, value in tags.items()}
    # Rank by the count only. Sorting on the whole (count, set) tuple would
    # break ties with set '<', which is a subset test and not a total order.
    # sorted() is stable, so tied hashtags keep their first-seen order.
    sorted_tags = dict(
        sorted(counted.items(), key=lambda item: item[1][0], reverse=True)
    )
    create_csv(file_name, path, sorted_tags)

    names = list(sorted_tags)[:plots]
    counts = [entry[0] for entry in sorted_tags.values()][:plots]

    plt.scatter(names, counts)
    plt.title('Hashtag Distribution')
    plt.xlabel(f'Top {plots} hashtags from {l} posts.')
    plt.ylabel('Number of occurances')
    # Called after the title and labels exist so the layout makes room for them.
    plt.tight_layout()
    plt.show()
    return
if __name__ == "__main__":
    # Expected invocation: python3 <script> file.json n
    if len(sys.argv) != 3:
        print(f'ERROR: Please make sure you enter the following in the command line: python3 file.json n. Where n is a positive integer value and will plot top n hashtags in the number of occurances.')
        # Non-zero status so shells and callers can detect the usage error.
        sys.exit(1)

    # Parse n once; anything that is not an integer is treated like an
    # out-of-range value and rejected below.
    try:
        top_n = int(sys.argv[2])
    except ValueError:
        top_n = None

    # Zero or negative n would silently mis-slice the ranked list, so enforce
    # the "positive integer" requirement the message states.
    if top_n is None or top_n <= 0:
        print(f'ERROR: Please make sure the number in the command line input: python3 file.json n, is a positive integer.')
        sys.exit(1)

    try:
        plot_occurances(sys.argv[1], top_n)
    except Exception:
        # Report the exception type, then re-raise to keep the traceback.
        print("Unexpected error:", sys.exc_info()[0])
        raise