forked from geluso/kexp_song_search_engine
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse.py
executable file
·145 lines (114 loc) · 3.72 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/local/bin/python
import argparse
import sqlite3
import re
from subprocess import call
from bs4 import BeautifulSoup
from datetime import datetime
def parse_date(filename):
    """Return the date portion embedded in a playlist file name.

    The name must contain a stamp shaped like ``....-..-..-..`` (presumably
    ``YYYY-MM-DD-HH``); the first three dash-separated fields are returned as
    a single string and the trailing two characters are treated as the hour.

    filename -- name or path of the playlist file.

    Raises ValueError when the name does not contain such a stamp.
    """
    # Match against the argument itself rather than a module-level global so
    # the function also works when it is called from outside the script.
    match = re.match(r".*(....-..-..)-(..).*", filename)
    if match is None:
        raise ValueError("no date stamp found in filename: %r" % (filename,))
    date, hour = match.groups()
    if hour == "00":
        # Files stamped with hour "00" are echoed to standard output
        # (looks like a diagnostic aid for midnight files -- TODO confirm).
        print(filename)
    return date
def parse_play(play):
    """Collect the fields of one playlist entry into a dictionary.

    ``play`` is handed to each of the ``parse_*`` helpers in turn.  The air
    time is stored exactly as ``parse_airdate`` returns it; the artist, album,
    song and comment values are each passed through ``.encode("utf-8")``.

    Returns a dict with the keys "time", "artist", "album", "song" and
    "comment".
    """
    entry = {"time": parse_airdate(play)}
    extractors = (
        ("artist", parse_artist),
        ("album", parse_album),
        ("song", parse_song),
        ("comment", parse_comment),
    )
    # The helpers run in the order listed above.
    for key, extract in extractors:
        entry[key] = extract(play).encode("utf-8")
    return entry
def parse_airdate(play):
    """Return the air time of a play as a 24-hour "HH:MM" string.

    The time is read from the ``span`` inside the play's "AirDate" div and is
    expected to look like "<hour>:<minute> <AM|PM>" with a single space
    between the clock and the meridiem marker (matched case-insensitively).
    """
    raw = play.find("div", class_="AirDate").span.text
    pieces = raw.split(" ")
    clock = pieces[0]
    meridiem = pieces[1].upper()

    clock_fields = clock.split(":")
    hour = int(clock_fields[0])
    minute = int(clock_fields[1])

    if meridiem == "AM" and hour == 12:
        # 12 AM is midnight.
        hour = 0
    elif meridiem == "PM" and hour != 12:
        # Afternoon hours shift by twelve; 12 PM (noon) stays as it is.
        hour += 12

    return "%02d:%02d" % (hour, minute)
def parse_artist(play):
    """Return the text of the link inside the play's "ArtistName" div."""
    return play.find("div", class_="ArtistName").find("a").text
def parse_song(play):
    """Return the text of the play's "TrackName" div."""
    return play.find("div", class_="TrackName").text
def parse_album(play):
    """Return the text of the play's "ReleaseName" div."""
    return play.find("div", class_="ReleaseName").text
def parse_comment(play):
    """Return the play's comment text, or "" when there is none.

    The text of the "CommentText" div is stripped of surrounding whitespace
    and every carriage return, newline and tab character is removed.
    """
    node = play.find("div", class_="CommentText")
    if not node:
        return ""
    return re.sub("[\r\n\t]", "", node.text.strip())
def parse(html):
    """Parse a saved playlist page into a list of song dictionaries.

    html -- path of the HTML file to read.

    Returns a list holding the result of ``parse_play`` for every element
    matched by ``find_all("div", "Play")``, in document order.
    """
    # Read through a context manager so the file handle is closed as soon as
    # the contents are in memory, even if reading fails.
    with open(html) as page:
        markup = page.read()
    soup = BeautifulSoup(markup, "html.parser")
    return [parse_play(play) for play in soup.find_all("div", "Play")]
def print_songs(songs, date):
    """Print every song to standard output.

    For each song three lines are written -- "<date> <time>", the artist and
    the song title -- followed by one empty line.
    """
    for entry in songs:
        # Each call passes a single, already formatted value to print.
        print("%s %s" % (date, entry["time"]))
        print(entry["artist"])
        print(entry["song"])
        print("")
def songs_to_db(db, date, songs):
    """Store every song in ``songs`` by calling ``insert_db`` once per song.

    db    -- database handle passed through to ``insert_db``.
    date  -- date string passed through to ``insert_db``.
    songs -- iterable of song dictionaries.
    """
    for entry in songs:
        insert_db(db, date, entry)
def setup_db(db):
    """Open the SQLite database at ``db`` and make sure its tables exist.

    Creates the ``plays``, ``artists`` and ``songs`` tables when they are
    missing, commits, and returns the open connection object produced by
    ``sqlite3.connect``.
    """
    schema = (
        "CREATE TABLE IF NOT EXISTS plays (date datetime UNIQUE, artist text, song text, comment text);",
        "CREATE TABLE IF NOT EXISTS artists (artist text UNIQUE);",
        "CREATE TABLE IF NOT EXISTS songs (song text, artist text, UNIQUE(song, artist));",
    )
    connection = sqlite3.connect(db)
    # Have TEXT values handed back as ``str`` objects.
    connection.text_factory = str
    for statement in schema:
        connection.execute(statement)
    connection.commit()
    return connection
def try_execute(db, sql, args):
    """Execute ``sql`` with ``args`` on ``db``, ignoring integrity errors.

    db   -- object exposing ``execute(sql, args)``.
    sql  -- SQL statement text.
    args -- parameters bound to the statement.

    An ``sqlite3.IntegrityError`` raised by the statement is swallowed; any
    other exception propagates to the caller.
    """
    try:
        db.execute(sql, args)
    except sqlite3.IntegrityError:
        # Deliberately ignored: the statement is simply skipped.
        return
def insert_db(db, date, song):
    """Record one play, its artist and its song in the database, then commit.

    db   -- database handle exposing ``execute`` and ``commit``.
    date -- date string that prefixes the play's timestamp.
    song -- dict providing the "time", "comment", "song" and "artist" keys.

    Each insert goes through ``try_execute``.
    """
    # The play time only carries hour and minute, so seconds are fixed at "00".
    played_at = "%s %s:00" % (date, song["time"])
    # A missing comment is stored as the literal string "null".
    note = song["comment"] or "null"
    title = song["song"]
    performer = song["artist"]

    statements = (
        ('INSERT INTO plays VALUES(?, ?, ?, ?);', (played_at, performer, title, note)),
        ('INSERT INTO artists VALUES(?)', (performer, )),
        ('INSERT INTO songs VALUES(?, ?)', (title, performer)),
    )
    for sql, params in statements:
        try_execute(db, sql, params)
    db.commit()
if __name__ == "__main__":
    # Command line: a required HTML file and an optional database location.
    parser = argparse.ArgumentParser(
        description="Parses song, artist and playtime information from the KEXP playlist page."
    )
    parser.add_argument(
        "html",
        type=str,
        help="the HTML file containing KEXP playlist information for one hour.",
    )
    parser.add_argument(
        "db",
        type=str,
        nargs="?",
        default="",
        help="the location of the sqlite database",
    )
    args = parser.parse_args()

    songs = parse(args.html)
    date = parse_date(args.html)

    if not args.db:
        # No database given: list the parsed songs on standard output.
        print_songs(songs, date)
    else:
        # Database given: store the songs, then close the connection.
        db = setup_db(args.db)
        songs_to_db(db, date, songs)
        db.close()