-
Notifications
You must be signed in to change notification settings - Fork 14
/
parse.py
26 lines (22 loc) · 1.09 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
"""
Code to parse AIMA file of links, such as `old-links.txt`.
"""
import collections
import re
# Record type for one parsed link; the fields are filled in by `parse_link`.
Link = collections.namedtuple(
    'Link',
    ['title', 'star', 'url', 'topics', 'genre', 'comment'],
)

# One link per line:  TITLE[*]  URL  TOPIC[,TOPIC...].GENRE  [COMMENT]
# The adjacent raw literals below concatenate into a single pattern; the
# numbers refer to the capture groups as returned by `match.groups()`.
REGEX = re.compile(
    r"([^*]+?)"                              # 1: title (shortest run without '*')
    r"([*]?)"                                # 2: optional star marker
    r"\s+"
    r"((http|https|mailto|ftp):[^\s]+)"      # 3: full url, 4: its scheme
    r"\s+"
    r"([a-z,]+)"                             # 5: comma-separated topics
    r"[.]"
    r"([a-z,]+)"                             # 6: genre
    r"\s*"
    r"(.*)$"                                 # 7: free-form comment (may be empty)
)

# Topic names accepted by `parse_link`.
TOPICS = {
    'agents', 'intro', 'java', 'learning', 'lisp',
    'logic', 'nlp', 'phil', 'planning', 'prolog',
    'python', 'robotics', 'search', 'uncertainty',
}

# Genre names accepted by `parse_link`.
GENRES = {
    'com', 'edu', 'humor', 'jour', 'list',
    'news', 'org', 'people', 'ref', 'soft',
}
def parse_link(line):
    """Parse a line of text into a `Link` structure.

    The line must match `REGEX`: a title, an optional `*`, a URL, a
    comma-separated list of topics, a dot, a genre, and an optional
    trailing comment.

    Returns a `Link` whose `star` field is a bool (True when the `*` marker
    is present) and whose `topics` field is a set of strings.

    Raises AssertionError if the line does not match `REGEX`, if the genre
    is not in `GENRES`, or if any topic is not in `TOPICS`.
    """
    m = REGEX.match(line)
    # The checks raise explicitly instead of using `assert` so that they
    # still run under `python -O`. AssertionError (with the same messages)
    # is kept so existing callers see the same exception type as before.
    if not m:
        raise AssertionError('Line does not match: ' + line)
    # Group 4 is the URL scheme, already contained in `url`; it is discarded.
    title, star, url, _, topics, genre, comment = m.groups()
    topics = set(topics.split(','))
    if genre not in GENRES:
        raise AssertionError('Unknown genre: ' + genre)
    unknown_topics = topics - TOPICS
    if unknown_topics:
        raise AssertionError('Unknown topics: ' + str(unknown_topics))
    return Link(title, star == '*', url, topics, genre, comment)
def parse_file(filename='old-links.txt'):
    """Parse a file into a list of Links.

    Each line of `filename` is passed to `parse_link`, in file order; any
    exception raised while opening the file or parsing a line propagates
    to the caller.
    """
    # `with` guarantees the file is closed, even when a line fails to parse.
    with open(filename) as f:
        return [parse_link(line) for line in f]