-
Notifications
You must be signed in to change notification settings - Fork 0
/
vortex_juice.py
134 lines (93 loc) · 3.99 KB
/
vortex_juice.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
from urllib import urlopen
from re import match, search, findall, finditer
from random import sample
from xml.dom.minidom import parseString, parse
class FacultyIndex:
    """Extract faculty identifiers from a downloaded course index page.

    The page at ``url`` is fetched once, on construction. A faculty
    identifier is the run of word characters that directly follows
    ``self.prefix`` anywhere in the page text.
    """

    def __init__(self, url="http://www.uio.no/studier/emner/"):
        """Download the index page and set up the filtering rules.

        url -- address of the index page to fetch.
        """
        self.url = url
        # Close the connection explicitly rather than leaving it to the
        # garbage collector.
        response = urlopen(self.url)
        try:
            self.page = response.read()
        finally:
            response.close()
        # Identifiers that match the pattern but are dropped from the result.
        # Left disabled in the original list: "index", "v12", "v13", "h12".
        self.excluded = set(["alfabetisk", "alphabetical", "nedlagt"])
        self.prefix = "emner/"

    def get_faculties(self):
        """Return the set of unique identifiers following the prefix.

        Entries listed in ``self.excluded`` are removed from the result.
        """
        # Capture only the part after the prefix; str.replace() would also
        # strip any later occurrence of the prefix inside the identifier.
        pattern = self.prefix + r"(\w+)"
        found = set(findall(pattern, self.page))
        return found.difference(self.excluded)
class InstituteIndex:
    """Look up the institute path segments listed under each faculty.

    The page at ``url`` is fetched once, on construction. Institutes are
    returned as they appear in the page, including the surrounding
    slashes (for example ``"/ifi/"``).
    """

    def __init__(self, faculties, url="http://www.uio.no/studier/emner/"):
        """Download the index page.

        faculties -- iterable of faculty identifiers used by
                     get_all_institutes().
        url       -- address of the index page to fetch.
        """
        self.url = url
        # Close the connection explicitly rather than leaving it to the
        # garbage collector.
        response = urlopen(self.url)
        try:
            self.page = response.read()
        finally:
            response.close()
        self.faculties = faculties
        self.prefix = "studier/emner/"

    def get_all_institutes(self):
        """Return the union of the institute segments of every faculty."""
        institutes = set()
        for faculty in self.faculties:
            institutes.update(self.get_institutes_of_faculty(faculty))
        return institutes

    def get_institutes_of_faculty(self, faculty):
        """Return the set of "/name/" segments found after prefix + faculty.

        faculty -- faculty identifier; it is inserted into the regular
                   expression as-is.
        """
        # Capture the "/name/" part directly instead of matching the whole
        # path and stripping the leading part with str.replace().
        pattern = "%s%s%s" % (self.prefix, faculty, r"(/\w+/)")
        return set(findall(pattern, self.page))
class FacultyData:
    """Subject listing of one course page, parsed as XML with minidom.

    The raw document is downloaded on construction and kept in
    ``self.data``; every query method parses it again.
    """

    def __init__(self, url="http://www.uio.no/studier/emner/"):
        """Download the document at ``url`` into ``self.data``."""
        # Close the connection explicitly rather than leaving it to the
        # garbage collector.
        response = urlopen(url)
        try:
            self.data = response.read()
        finally:
            response.close()

    def get_subject_descriptions(self):
        """Return the text of each entry in the first ``<ul class="main">``.

        For every child of that list, the XML of the first child of its
        first child is collected (for ``<li><a>text</a></li>`` that is
        ``text``). Children that lack such a nested node, for example
        whitespace between the items, are skipped. An empty list is
        returned when the document has no ``<ul class="main">``.
        """
        model = parseString(self.data)
        main_lists = [e for e in model.getElementsByTagName("ul")
                      if e.getAttribute("class") == "main"]
        if not main_lists:
            return []
        descriptions = []
        for item in main_lists[0].childNodes:
            link = item.firstChild
            # Text and comment nodes have no children; ignore them instead
            # of failing with AttributeError.
            if link is None or link.firstChild is None:
                continue
            descriptions.append(link.firstChild.toxml())
        return descriptions

    def get_precise_subject_quantity(self):
        """Return every digit run in the first ``<h2>`` as a list of strings.

        An empty list is returned when there is no ``<h2>`` or when its
        first child carries no text value.
        """
        headings = parseString(self.data).getElementsByTagName("h2")
        if not headings:
            return []
        first = headings[0].firstChild
        if first is None or first.nodeValue is None:
            return []
        return findall(r"\d+", first.nodeValue)

    def verify_number_of_subjects(self):
        """Return True when the first number in the heading equals the
        number of subject descriptions; False otherwise, including when
        the heading holds no number.
        """
        # The original called a non-existent get_subject_quantity().
        counts = self.get_precise_subject_quantity()
        if not counts:
            return False
        return int(counts[0]) == len(self.get_subject_descriptions())

    def get_subject_codes(self):
        """Return the subject code found at the start of each description.

        Descriptions that do not start with a code are left out.
        """
        # Hyphenated form first (e.g. "MAT-INF1100"), then the plain form
        # (e.g. "INF1000").
        complex_subject = r"(\w{1,5}-\w{1,5}\d{1,5})+"
        regular_subject = r"(\w{1,5}\d{1,5})+"
        #exceptions = "([AZ]){1,10}"
        pattern = "(" + complex_subject + "|" + regular_subject + ")+"
        matches = [match(pattern, d)
                   for d in self.get_subject_descriptions()]
        return [m.group(1) for m in matches if m is not None]

    def save(self, name):
        """Write the subject codes, one per line, to ``data/<name>.dat``.

        The ``data`` directory is created when it does not exist yet.
        """
        import os

        text = "\n".join(self.get_subject_codes())
        if not os.path.isdir("data"):
            os.makedirs("data")
        with open("data/%s.dat" % name, "w") as output:
            output.write(text)
if __name__ == "__main__":
    # Build a mapping from each faculty to the institute segments listed
    # under it; faculties without any institute are left out.
    faculties = FacultyIndex().get_faculties()
    institute_index = InstituteIndex(faculties)
    structure = {}
    for f in faculties:
        members = institute_index.get_institutes_of_faculty(f)
        if members:
            structure[f] = members
    #structure["annet"].add( "/teologi/" )
    print(structure)

    # Download every institute page and store its subject codes in
    # data/<faculty>-<institute>.dat.
    for faculty, members in structure.items():
        for i in members:
            # Institute segments carry their own slashes ("/ifi/"), so strip
            # them before joining to avoid a double slash in the URL.
            institute = i.replace("/", "")
            if institute:
                url = "http://www.uio.no/studier/emner/%s/%s/" % (faculty, institute)
                filename = "%s-%s" % (faculty, institute)
            else:
                url = "http://www.uio.no/studier/emner/%s/" % faculty
                filename = "%s" % (faculty)
            data = FacultyData(url=url)
            data.save(filename)