-
Notifications
You must be signed in to change notification settings - Fork 46
/
WASEHTMLParser.py
83 lines (76 loc) · 2.83 KB
/
WASEHTMLParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import sys
if sys.version_info[0] == 2:
from HTMLParser import HTMLParser
else:
from html.parser import HTMLParser
# extract values from attrList of attributes whose name is contained in attrNames
def add_attrs(attrNames, attrList):
    """Collect the values of the attributes whose name is in *attrNames*.

    attrNames -- container of attribute names to look for.
    attrList  -- iterable of (name, value) pairs.

    Returns a list of the matching values, in the order they occur in
    *attrList*.
    """
    values = []
    for pair in attrList:
        if pair[0] in attrNames:
            values.append(pair[1])
    return values
def has_attr(attrs, attr):
    """Return True if *attrs* contains an attribute named *attr*.

    attrs -- iterable of (name, value) pairs.
    attr  -- attribute name to look for.
    """
    names = (pair[0] for pair in attrs)
    return attr in names
def attr_val_is(attrs, attr, val):
    """Return True if the first attribute named *attr* has the value *val*.

    attrs -- iterable of (name, value) pairs.
    attr  -- attribute name to look for.
    val   -- value the attribute is compared against.

    Returns False when no attribute named *attr* is present, or when
    *attrs* is not an iterable of indexable pairs.
    """
    # Iterate directly instead of subscripting the result of filter():
    # on Python 3 filter() returns an iterator, which is not subscriptable,
    # so the subscript form always failed and this function returned False.
    try:
        for pair in attrs:
            if pair[0] == attr:
                # Only the first occurrence of the attribute is considered.
                return pair[1] == val
    except (TypeError, IndexError):
        # Malformed input (e.g. None or too-short entries): stay
        # best-effort and report "no match" rather than raising.
        return False
    return False
class WASEHTMLParser(HTMLParser, object):
    """HTML parser that collects reference attributes found in start tags.

    While parsing, attribute values are sorted into one set per category:
    doctype, base, stylesheets, frames, scripts, links, images, audio,
    video, objects and formactions.  close() additionally builds
    ``extrefs``, the union of all categories except doctype and base.

    ``object`` is listed as a second base class, presumably so that
    super() also works where HTMLParser is an old-style class (Python 2).
    """

    def reset(self):
        """Empty every collection and reset the underlying parser state."""
        self.doctype = set()
        self.base = set()
        self.stylesheets = set()
        self.frames = set()
        self.scripts = set()
        self.links = set()
        self.images = set()
        self.audio = set()
        self.video = set()
        self.objects = set()
        self.formactions = set()
        # Defined here as well so that reading extrefs before close() yields
        # an empty set instead of raising AttributeError.
        self.extrefs = set()
        super(WASEHTMLParser, self).reset()

    def handle_decl(self, decl):
        """Record the text of a declaration such as a doctype."""
        self.doctype.add(decl)

    def handle_starttag(self, tag, attrs):
        """Store the reference attributes of *tag* in the matching set.

        tag   -- name of the start tag.
        attrs -- list of (name, value) pairs of the tag's attributes.

        Tags that are not listed below are ignored.
        """
        if tag == "iframe":
            self.frames.update(add_attrs(["src"], attrs))
        elif tag == "base":
            self.base.update(add_attrs(["href"], attrs))
        elif tag == "link" and attr_val_is(attrs, "rel", "stylesheet"):
            self.stylesheets.update(add_attrs(["href"], attrs))
        elif tag == "script":
            self.scripts.update(add_attrs(["src"], attrs))
        elif tag in ("a", "area"):
            self.links.update(add_attrs(["href"], attrs))
        elif tag == "img":
            self.images.update(add_attrs(["src"], attrs))
        elif tag == "input":
            # An input can carry both an image source and a form action
            # override.  It needs its own branch: sharing the "img" branch
            # meant the formaction of an input was never recorded.
            self.images.update(add_attrs(["src"], attrs))
            self.formactions.update(add_attrs(["formaction"], attrs))
        elif tag in ("svg", "image"):
            self.images.update(add_attrs(["href", "xlink:href"], attrs))
        elif tag == "audio":
            self.audio.update(add_attrs(["src"], attrs))
        elif tag == "video":
            self.video.update(add_attrs(["src"], attrs))
        elif tag == "object":
            self.objects.update(add_attrs(["data"], attrs))
        elif tag == "embed":
            self.objects.update(add_attrs(["src"], attrs))
        elif tag == "applet":
            self.objects.update(add_attrs(["code"], attrs))
        elif tag == "form":
            self.formactions.update(add_attrs(["action"], attrs))
        elif tag == "button":
            self.formactions.update(add_attrs(["formaction"], attrs))

    def close(self):
        """Build ``extrefs`` from the collected sets and close the parser.

        ``extrefs`` is the union of stylesheets, frames, scripts, links,
        images, audio, video, objects and formactions; doctype and base
        are not part of it.

        Returns whatever the base class close() returns.
        """
        self.extrefs = set()
        self.extrefs.update(
            self.stylesheets,
            self.frames,
            self.scripts,
            self.links,
            self.images,
            self.audio,
            self.video,
            self.objects,
            self.formactions,
        )
        return super(WASEHTMLParser, self).close()