# hamRadioServer1-3.py

# server.py version 1.3
# hamRadio
# Edits: adding user data functionality. Mainly pseudocode
# Aakash Indurkhya, Sarthak Sahu, Matt Conroy
# Created: May 15, 2015

import SocketServer
import json
import urllib2
import time
import math
from BaseHTTPServer import BaseHTTPRequestHandler
import urlparse
import random

# In-memory cache of Echonest audio attributes, keyed by Spotify track id.
# Each value is a list of floats; loadEchonestAttributes() fills it from the
# database file and fetchEchonestAttributes() adds tracks fetched on demand.
echonest_attributes = {}

# Index into apiKey of the key used for the most recent Echonest request;
# fetchEchonestAttributes() advances it cyclically before every request.
keyChoice = 0
# Echonest API keys that successive requests rotate through.
# NOTE(review): these look like placeholders -- confirm real keys are supplied.
apiKey = ['key1', 'key2', 'key3']


class GetHandler(BaseHTTPRequestHandler):
    # HTTP handler for playlist recommendations.
    #
    # The request path is read as a comma-separated list of track ids,
    # optionally wrapped in '[' and ']'. The response body is the id returned
    # by computeNewTracks(), the text 'empty' when the path carries no usable
    # playlist, or 'error' (with status 500) when the recommendation fails.

    def do_GET(self):
        # Answer a GET request with one recommended track id as plain text.
        parsed_path = urlparse.urlparse(self.path)
        print(parsed_path.path)

        currentPlaylist = parsed_path.path.strip('/').strip('[').strip(']').split(',')
        status = 200
        if currentPlaylist == [''] or len(currentPlaylist[0]) < 3:
            tracksToAdd = 'empty'
        else:
            try:
                tracksToAdd = computeNewTracks(currentPlaylist)
            except Exception as err:
                # Top-level boundary: the recommendation depends on remote
                # services, and an escaping exception would close the
                # connection without any HTTP response. Log it and reply
                # with an explicit server error instead.
                self.log_error('recommendation failed for %s: %r',
                               parsed_path.path, err)
                status = 500
                tracksToAdd = 'error'

        self.send_response(status)
        self.send_header('Content-Type', 'text/plain')
        self.send_header('Last-Modified', self.date_time_string(time.time()))
        # CORS headers are sent on every response (including errors) so that
        # browser clients on other origins can read the result.
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Expose-Headers","Access-Control-Allow-Origin")
        self.send_header("Access-Control-Allow-Headers","Origin, X-Requested-With, Content-Type, Accept")
        self.end_headers()
        self.wfile.write(str(tracksToAdd))
        return

# Parses the raw request string into the user entry and the user's playlists.
#
# NOTE(review): the wire format is assumed from the original pseudocode --
# '$'-delimited items, each of the form "key:v1,v2,...", the first item
# describing the user and every following item one playlist. Confirm against
# the client before relying on it.
#
# data:    request string, e.g. "$user:42$rock:id1,id2$jazz:id3"
# returns: (user, playlists) where user is the value list of the first item
#          and playlists maps each remaining item's key to its value list
# raises:  ValueError if data holds no items or an item has no ':'
def constructInput(data):
    # Empty chunks come from leading, trailing or doubled '$' markers.
    items = [parseItem(chunk) for chunk in data.split('$') if chunk]
    if not items:
        raise ValueError('request contains no items')

    # first thing in the path should be the user id
    user = items[0][1]

    # every remaining item is a playlist: name -> list of track ids
    playlists = {}
    for key, val in items[1:]:
        playlists[key] = val

    # TODO: update cache of user profile
    return user, playlists
# Splits one "key:v1,v2,..." item into its key and its list of values.
#
# Only the first ':' separates key from values; the remainder is split on
# ','. Returns a two-element list [key, values].
# Raises ValueError when the item contains no ':'.
def parseItem(item):
    key, raw_values = item.split(':', 1)
    return [key, raw_values.split(',')]

# Runs the recommendation algorithm for a playlist and returns the id of the
# single track to add next.
#
# One playlist track is picked at random as the seed. Candidate tracks are
# gathered from up to four albums of the seed artist and of up to five
# related artists, filtered against the playlist by name similarity, and
# ranked by hamRadio distance to the seed.
#
# seedList: non-empty list of track ids currently in the playlist
# returns:  the id of the closest candidate, or the string 'empty' (the same
#           no-result value the request handler uses) when the seed cannot
#           be profiled or no candidate survives filtering
def computeNewTracks(seedList):
    # Look every playlist track up once; the seed's profile is reused below
    # instead of being requested a second time.
    profiles = {}
    for track_id in seedList:
        if track_id not in profiles:
            profiles[track_id] = fetchFromSpotify(track_id, 'track')

    # [id, name] pairs; tracks whose lookup failed (empty profile) are left
    # out rather than crashing on the missing name.
    playlist = [[track_id, profiles[track_id][1]]
                for track_id in seedList if profiles[track_id]]

    seed = random.choice(seedList)
    seedProfile = profiles[seed]
    if not seedProfile:
        return 'empty'
    seedArtist = seedProfile[0]
    seedName = seedProfile[1]

    print(seedName)

    # Without attributes for the seed no distance can be computed.
    if not fetchEchonestAttributes(seed):
        print('bleh')
        return 'empty'
    print(echonest_attributes[seed])

    # fetch related artists for seed; the seed artist is a source as well
    relArtists = fetchFromSpotify(seedArtist, 'seed_artist')[:5]
    relArtists.append(seedArtist)

    # fetch albums for every artist (the album fetcher already leaves out
    # 'appears on' and 'compilation' entries)
    print('getting albums')
    albums = []
    for artist in relArtists:
        albums.extend(fetchFromSpotify(artist, 'artist')[:4])

    # fetch tracks from albums, keeping only tracks that have attributes
    print('getting candidates')
    tracks = []
    for album in albums:
        for track in fetchFromSpotify(album, 'album'):
            if fetchEchonestAttributes(track[0]) and track not in tracks:
                tracks.append(track)

    # drop candidates too similar to playlist tracks, then rank the rest as
    # [track_id, name, hamRadio distance to seed]
    tracks = filterCandidateList(playlist, tracks)
    candidates = sortByDist(
        [[track[0], track[1], getHamRadioDistance(seed, track[0])]
         for track in tracks])

    print('sorted candidates')
    print(candidates[:10])

    # Slicing (not indexing) keeps an empty candidate list from raising.
    shortList = candidates[:1]

    # make selections for new tracks from short list
    selection = finalSelection(seedList, shortList)
    print(selection)
    if not selection:
        return 'empty'
    print(computeSimilarity(seedName, selection[0][1]))
    return selection[0][0]


# Chooses the tracks to add to the playlist from the short list of
# candidates.
#
# The selection is currently the identity: every short-list entry is kept,
# in order, in a newly built list. currentPlaylist is accepted but not used.
def finalSelection(currentPlaylist, shortList):
    return list(shortList)
    
# Loads the database of Echonest attributes into echonest_attributes.
#
# Every non-blank line of the database file is "track_id,v1,...,v13". Each
# value is stored as a float; the literal 'None' (no value received from
# Echonest) is stored as inf.
#
# NOTE(review): this reads '../data/attributes.txt', while
# fetchEchonestAttributes() appends new rows to
# '../data/track_ids70_attributes.txt' -- confirm which file is intended.
#
# raises: IOError if the file cannot be opened; IndexError / ValueError on a
#         row with fewer than 14 fields or a non-numeric value
def loadEchonestAttributes():
    # Filename of database
    database_file = '../data/attributes.txt'

    # 'with' closes the file even when a row fails to parse.
    with open(database_file, 'r') as database:
        for line in database:
            line = line.rstrip()  # Removes \n character
            # Rows are appended with a leading newline, so blank lines can
            # occur between rows; skip them.
            if not line:
                continue
            row = line.split(',')
            # Columns 1..13 hold the attributes for the track id in column 0.
            values = [float('inf') if row[i] == 'None' else float(row[i])
                      for i in range(1, 14)]
            echonest_attributes[row[0]] = values


# Ensures the given track_id has attributes in echonest_attributes.
#
# A cached track returns immediately. Otherwise the Echonest track profile
# is requested (rotating through the configured API keys), its audio summary
# is converted to a list of 13 floats in a fixed order, cached, and appended
# to the database file.
#
# track_id: Spotify track id (without the 'spotify:track:' prefix)
# returns:  True if attributes are available afterwards; False on any
#           request failure, Echonest error status, or incomplete summary
#
# NOTE(review): track_id is placed into the URL and the database row
# unescaped -- confirm it is validated upstream.
def fetchEchonestAttributes(track_id):
    global keyChoice

    # First check if track_id is already in dictionary of attributes
    if track_id in echonest_attributes:
        return True

    # Order of the stored attribute vector. It must stay in step with the
    # whiteList in getHamRadioDistance() and with the database columns, so
    # it is spelled out here rather than taken from dict iteration order.
    attribute_order = ('key', 'tempo', 'energy', 'liveness', 'speechiness',
                       'acousticness', 'instrumentalness', 'mode',
                       'time_signature', 'duration', 'loudness', 'valence',
                       'danceability')

    # Rotate to the next API key before every request.
    keyChoice = (keyChoice + 1) % len(apiKey)

    URL = 'http://developer.echonest.com/api/v4/track/profile?api_key=' + apiKey[keyChoice] + '&id=spotify:track:' + track_id + '&bucket=audio_summary'

    # urlopen raises for connection problems and HTTP error statuses;
    # callers expect a plain False in those cases.
    try:
        data = urllib2.urlopen(URL)
    except urllib2.URLError:
        return False
    try:
        trackSummary = json.loads(data.read())
    except ValueError:
        # Body was not valid JSON
        return False
    finally:
        data.close()

    # Any non-zero status (3 = too many requests) or missing section means
    # no usable summary.
    try:
        response = trackSummary['response']
        if response['status']['code'] != 0:
            return False
        summary = response['track']['audio_summary']
    except (KeyError, TypeError):
        return False
    if not isinstance(summary, dict):
        return False

    values = []
    for param in attribute_order:
        if param not in summary:
            return False
        value = summary[param]
        # Only a missing value (null) becomes inf; a real 0 is kept as 0.0.
        if value is None:
            values.append(float('inf'))
        else:
            values.append(float(value))

    # Add attributes to dictionary
    echonest_attributes[track_id] = values

    # Add attributes to database file as "\ntrack_id,v1,...,v13"
    database_file = '../data/track_ids70_attributes.txt'
    with open(database_file, 'a') as database:
        database.write('\n' + ','.join([track_id] + [str(val) for val in values]))
    return True

# Computes the hamRadio distance between two tracks: the square root of the
# summed squared differences over the whitelisted attribute positions.
#
# Both track ids must already be keys of echonest_attributes (KeyError
# otherwise). A position whose squared difference is inf or NaN contributes
# nothing to the sum.
def getHamRadioDistance(seed, track):
    seed_vals = echonest_attributes[seed]
    track_vals = echonest_attributes[track]

    # 1 marks the attribute positions that take part in the distance.
    whiteList = [0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]

    squared_sum = 0
    for idx, seed_val in enumerate(seed_vals):
        if not whiteList[idx]:
            continue
        term = (track_vals[idx] - seed_val) ** 2
        # skip terms that cannot be compared (inf or NaN)
        if term == float('inf') or math.isnan(term):
            continue
        squared_sum += term
    return squared_sum ** 0.5

# Returns a new list with the given entries ordered by ascending distance.
# The distance is read from index 2 of every entry and converted with
# float(), so entries look like:
# [[track_id1, name1, hamRadio_dist1], [track_id2, name2, hamRadio_dist2]]
# Entries with equal distance keep their input order.
def sortByDist(distances):
    def distance_of(entry):
        return float(entry[2])

    ordered = list(distances)
    ordered.sort(key=distance_of)
    return ordered

# Computes the longest common substring of s1 and s2.
#
# table[row][col] is the length of the common run that ends at s1[row - 1]
# and s2[col - 1]. When several runs share the maximum length, the one
# ending first in s1 is returned. Returns '' when nothing is shared.
def longest_common_substring(s1, s2):
    width = len(s2) + 1
    table = [[0] * width for _ in range(len(s1) + 1)]
    best_len = 0
    best_end = 0
    for row, ch1 in enumerate(s1, 1):
        for col, ch2 in enumerate(s2, 1):
            if ch1 != ch2:
                table[row][col] = 0
                continue
            run = table[row - 1][col - 1] + 1
            table[row][col] = run
            if run > best_len:
                best_len = run
                best_end = row
    return s1[best_end - best_len:best_end]

# Similarity test for two track names. Returns a value between 0 and 1,
# where 1 means "treat as the same track".
#
# Returns 1 outright when either name is empty after stripping trailing
# whitespace, when the names are equal, when one name of more than two words
# is contained in the other, or when a shared leading word is followed by
# ' -' in the other name. Otherwise the score is the larger of the
# common-word ratio and the longest-common-substring ratio, raised to
# 1/pos, where pos is the number of identical leading words (so shared
# leading words push the score towards 1).
def computeSimilarity(track_1, track_2):
    track_1 = track_1.rstrip()
    track_2 = track_2.rstrip()

    # Invalid (empty) inputs count as similar
    if not track_1 or not track_2:
        return 1

    # Identical names
    if track_1 == track_2:
        print(track_1 + '\t' + track_2)
        return 1

    # Words of each name, with hyphens treated as separators
    track_1_list = track_1.replace('-', ' ').split()
    track_2_list = track_2.replace('-', ' ').split()

    # A name of more than two words entirely contained in the other
    if (track_1 in track_2 and len(track_1_list) > 2) or (track_2 in track_1 and len(track_2_list) > 2):
        return 1

    # Count words common to both names, matching each occurrence at most
    # once. Matches are consumed from a copy, so the list being iterated is
    # never modified and no word is skipped.
    unmatched = list(track_2_list)
    num_common_words = 0
    for word in track_1_list:
        if word in unmatched:
            unmatched.remove(word)
            num_common_words += 1

    # Names made only of hyphens have no words; avoid dividing by zero.
    total_words = len(track_1_list) + len(track_2_list)
    if total_words:
        common_word_ratio = 2.0 * num_common_words / total_words
    else:
        common_word_ratio = 0.0

    # lcs = longest common substring
    lcs = longest_common_substring(track_1, track_2)
    lcs_ratio = 2.0 * len(lcs) / (len(track_1) + len(track_2))

    # Check how many starting words are the same
    pos = 0
    while pos < len(track_1_list) and pos < len(track_2_list) and track_1_list[pos] == track_2_list[pos]:
        # Check if a shared leading word is followed by a hyphen
        if (track_1_list[pos] + ' -') in track_2 or (track_2_list[pos] + ' -') in track_1:
            return 1
        pos += 1

    similarity = max(common_word_ratio, lcs_ratio)
    if pos == 0:
        return similarity
    # 1.0 / pos: with integer division the exponent would be 0 for pos >= 2
    # and every such pair would score exactly 1.
    return similarity ** (1.0 / pos)
    

# Removes unsuitable tracks from candidate_list, in place, and returns it.
#
# A candidate is dropped when its name contains 'Commentary' or ' - Live',
# or when computeSimilarity() rates it above 0.6 against any playlist track.
#
# playlist:       list of [track_id, name] entries
# candidate_list: list of (track_id, name) entries; modified in place
# returns:        the same candidate_list object, holding only the survivors
def filterCandidateList(playlist, candidate_list):
    kept = []
    for candidate_track in candidate_list:
        name = candidate_track[1]
        # Name-based exclusions do not depend on the playlist, so they apply
        # even when the playlist is empty.
        if 'Commentary' in name or ' - Live' in name:
            continue
        if any(computeSimilarity(playlist_track[1], name) > 0.6
               for playlist_track in playlist):
            continue
        kept.append(candidate_track)

    # Survivors are collected first and written back in one step; removing
    # entries while iterating the same list would skip the element after
    # every removal.
    candidate_list[:] = kept
    return candidate_list

# Take spotify internal id, track/album, list of what to return

# Fetches a track from the Spotify Web API.
#
# id:      Spotify track id
# returns: [artist_id, track_name] using the first listed artist, or [] when
#          the request fails or does not answer 200
def fetchTrackProfile(id):
    URL = 'http://api.spotify.com/v1/tracks/' + id
    # urlopen raises (instead of returning) on connection problems and HTTP
    # error statuses; map those to the same empty result as a non-200 reply.
    try:
        data = urllib2.urlopen(URL)
    except urllib2.URLError:
        return []
    try:
        if data.getcode() != 200:
            return []
        jsonObj = json.loads(data.read())
    finally:
        data.close()
    artist = jsonObj['artists'][0]['id']
    songName = jsonObj['name']
    return [artist, songName]

# Fetches the albums of an artist from the Spotify Web API.
#
# id:      Spotify artist id
# returns: list of album ids, leaving out entries whose album_type contains
#          'appears_on' or 'compilation'; [] when the request fails or does
#          not answer 200
def fetchArtistAlbums(id):
    URL = 'http://api.spotify.com/v1/artists/' + id + '/albums'
    # urlopen raises (instead of returning) on connection problems and HTTP
    # error statuses; map those to the same empty result as a non-200 reply.
    try:
        data = urllib2.urlopen(URL)
    except urllib2.URLError:
        return []
    try:
        if data.getcode() != 200:
            return []
        jsonObj = json.loads(data.read())
    finally:
        data.close()
    return [item['id'] for item in jsonObj['items']
            if 'appears_on' not in item['album_type']
            and 'compilation' not in item['album_type']]

# Fetches the tracks of an album from the Spotify Web API.
#
# id:      Spotify album id
# returns: list of (track_id, track_name) tuples, or [] when the request
#          fails or does not answer 200
def fetchAlbumTracks(id):
    URL = 'http://api.spotify.com/v1/albums/' + id + '/tracks'
    # urlopen raises (instead of returning) on connection problems and HTTP
    # error statuses; map those to the same empty result as a non-200 reply.
    try:
        data = urllib2.urlopen(URL)
    except urllib2.URLError:
        return []
    try:
        if data.getcode() != 200:
            return []
        jsonObj = json.loads(data.read())
    finally:
        data.close()
    return [(item['id'], item['name']) for item in jsonObj['items']]

# Fetches the artists related to an artist from the Spotify Web API.
#
# id:      Spotify artist id
# returns: list of related artist ids, or [] when the request fails or does
#          not answer 200
def fetchRelatedArtists(id):
    URL = 'http://api.spotify.com/v1/artists/' + id + '/related-artists'
    # urlopen raises (instead of returning) on connection problems and HTTP
    # error statuses; map those to the same empty result as a non-200 reply.
    try:
        data = urllib2.urlopen(URL)
    except urllib2.URLError:
        return []
    try:
        if data.getcode() != 200:
            return []
        jsonObj = json.loads(data.read())
    finally:
        data.close()
    return [artist['id'] for artist in jsonObj['artists']]

# Dispatches a Spotify lookup by kind of request.
#
# id:   Spotify id of the track, artist or album to look up
# type: 'track'       -> fetchTrackProfile(id)
#       'artist'      -> fetchArtistAlbums(id)
#       'album'       -> fetchAlbumTracks(id)
#       'seed_artist' -> fetchRelatedArtists(id)
# Any other type yields [].
def fetchFromSpotify(id, type):
    # Each route is wrapped in a lambda so that only the chosen fetcher is
    # looked up and called.
    routes = (
        ('track', lambda: fetchTrackProfile(id)),
        ('artist', lambda: fetchArtistAlbums(id)),
        ('album', lambda: fetchAlbumTracks(id)),
        ('seed_artist', lambda: fetchRelatedArtists(id)),
    )
    for kind, fetch in routes:
        if type == kind:
            return fetch()
    return []


# Script entry point: load the attribute database, then serve recommendation
# requests on port 8080 (all interfaces) until interrupted.
if __name__ == '__main__':
    from BaseHTTPServer import HTTPServer

    class HamRadioHTTPServer(HTTPServer):
        # The listen backlog is read when the constructor activates the
        # socket, so it has to be set on the class; assigning it on the
        # instance after construction has no effect.
        request_queue_size = 200

    loadEchonestAttributes()
    server = HamRadioHTTPServer(('', 8080), GetHandler)
    print('Starting server, use <Ctrl-C> to stop')
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        # Ctrl-C is the documented way to stop; exit without a traceback.
        pass
    finally:
        server.server_close()