-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtopartists.py
31 lines (31 loc) · 1.19 KB
/
topartists.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#===========================================================================
import io
import re

from pyspark import SparkConf, SparkContext

from song import Song
#===========================================================================
#Spark configuration
#(cluster) only the application name is set; no master is configured in code
confCluster = SparkConf().setAppName("Spark Test Cluster")
#(local machine) uses the "local" master; defined here but not passed to the
#context created below
confLocal = SparkConf()
confLocal.setMaster("local")
confLocal.setAppName("Spark Test Local")
#create the context from the cluster configuration and ship song.py with the
#job so the custom Song class can be imported where the tasks run
sc = SparkContext(pyFiles=["song.py"], conf=confCluster)
#===========================================================================
#read the input file as an RDD of text lines
data = sc.textFile("data.txt")
#turn every line into a Song object
songs = data.map(lambda line: Song(line))
#emit one (artist name, 1) pair per song
artist_pairs = songs.map(lambda song: (song.Artist, 1))
#add up the ones per artist name to get the number of occurrences
artist_counts = artist_pairs.reduceByKey(lambda left, right: left + right)
#flip each pair to (count, name) so the count becomes the sort key
count_first = artist_counts.map(lambda pair: (pair[1], pair[0]))
#order by count, largest first
artists = count_first.sortByKey(ascending=False)
#write all artists as "count, name" lines in the order produced by the sort
#above (descending count).
#The results are collected before the file is opened so that a failed Spark
#job does not truncate an existing topartists.txt.
rows = artists.collect()
#io.open yields a UTF-8 text file on both Python 2 and Python 3, so the file
#object encodes each name itself; concatenating name.encode("utf8") onto a str
#raises TypeError on Python 3. The with-block closes the file even if a write
#fails part-way through.
with io.open("topartists.txt", "w", encoding="utf-8") as text_file:
    for count, name in rows:
        text_file.write(u"{0}, {1}\n".format(count, name))
#===========================================================================