diff --git a/scrape.py b/scrape.py index 82d9d2e..d4287c1 100755 --- a/scrape.py +++ b/scrape.py @@ -11,6 +11,7 @@ import base64 import json +import re import requests from lxml import html from typing import Optional, Iterable, Dict, Tuple @@ -205,8 +206,16 @@ def get_user_info(username): return _get_user_info(username) +username_regex = re.compile(r'^[a-z][\w-]{1,14}$', flags=re.ASCII) + + def username_exists(username): - if get_user_info(username): + ''' From last.fm sign up page: + "Your username should be between 2 and 15 characters, begin with a letter and contain only letters, numbers, '_' or '-'" + Check username first locally for this format, and then remote for existence. + ''' + username = username.lower() + if (username == 'last.hq' or re.match(username_regex, username)) and get_user_info(username): return True diff --git a/tests.py b/tests.py new file mode 100644 index 0000000..40783dd --- /dev/null +++ b/tests.py @@ -0,0 +1,26 @@ +import unittest +import re + +from scrape import username_regex + + +class TestUserName(unittest.TestCase): + def test_username_regex(self): + username_list = [ + ('a1', True), + ('a1234567890', True), + ('a1234567890-_', True), + ('a12345678912345', True), + ('a', False), + ('a()', False), + ('aa ', False), + ('123', False), + ('a123456789123456', False), + ] + for username, valid in username_list: + with self.subTest(username=username): + self.assertEqual(re.match(username_regex, username) is not None, valid) + + +if __name__ == '__main__': + unittest.main() diff --git a/util.py b/util.py index abf2286..95639cd 100644 --- a/util.py +++ b/util.py @@ -66,7 +66,7 @@ def get_recent_users(): with open(RECENT_USERS_FILE) as f: return f.read() except FileNotFoundError: - return "" + return "last.hq" def add_recent_user(username): diff --git a/utils/api.py b/utils/api.py index 10ad4b7..3ad2205 100644 --- a/utils/api.py +++ b/utils/api.py @@ -68,7 +68,7 @@ def _get_user_info(username): print("Getting " + url.replace(API_KEY, 'SECRET')) from scrape import TIMEOUT resp = session.get(url, timeout=TIMEOUT) - if resp.status_code == 404: + if resp.status_code != 200: return '' # Dump json as text so we can cache it to disk return resp.text