|
12 | 12 | # See the License for the specific language governing permissions and |
13 | 13 | # limitations under the License. |
14 | 14 |
|
15 | | -import functools |
16 | | -from typing import Any, Dict, Iterator, List, Optional |
| 15 | +from typing import Dict, Optional, Set |
17 | 16 |
|
18 | | -import bleach |
19 | | -import bleach.callbacks |
20 | | -import bleach.linkifier |
21 | | -import bleach.sanitizer |
| 17 | +import nh3 |
22 | 18 |
|
23 | 19 |
|
24 | | -ALLOWED_TAGS = [ |
| 20 | +ALLOWED_TAGS = { |
25 | 21 | # Bleach Defaults |
26 | 22 | "a", "abbr", "acronym", "b", "blockquote", "code", "em", "i", "li", "ol", |
27 | 23 | "strong", "ul", |
|
32 | 28 | "span", "sub", "summary", "sup", "table", "tbody", "td", "th", "thead", |
33 | 29 | "tr", "tt", "kbd", "var", "input", "section", "aside", "nav", "s", "figure", |
34 | 30 | "figcaption", |
35 | | -] |
| 31 | +} |
36 | 32 |
|
37 | 33 | ALLOWED_ATTRIBUTES = { |
38 | 34 | # Bleach Defaults |
39 | | - "a": ["href", "title"], |
40 | | - "abbr": ["title"], |
41 | | - "acronym": ["title"], |
| 35 | + "a": {"href", "title"}, |
| 36 | + "abbr": {"title"}, |
| 37 | + "acronym": {"title"}, |
42 | 38 |
|
43 | 39 | # Custom Additions |
44 | | - "*": ["id"], |
45 | | - "hr": ["class"], |
46 | | - "img": ["src", "width", "height", "alt", "align", "class"], |
47 | | - "span": ["class"], |
48 | | - "th": ["align", "class"], |
49 | | - "td": ["align", "colspan", "rowspan"], |
50 | | - "div": ["align", "class"], |
51 | | - "h1": ["align"], |
52 | | - "h2": ["align"], |
53 | | - "h3": ["align"], |
54 | | - "h4": ["align"], |
55 | | - "h5": ["align"], |
56 | | - "h6": ["align"], |
57 | | - "code": ["class"], |
58 | | - "p": ["align", "class"], |
59 | | - "pre": ["lang"], |
60 | | - "ol": ["start"], |
61 | | - "input": ["type", "checked", "disabled"], |
62 | | - "aside": ["class"], |
63 | | - "dd": ["class"], |
64 | | - "dl": ["class"], |
65 | | - "dt": ["class"], |
66 | | - "ul": ["class"], |
67 | | - "nav": ["class"], |
68 | | - "figure": ["class"], |
| 40 | + "*": {"id"}, |
| 41 | + "hr": {"class"}, |
| 42 | + "img": {"src", "width", "height", "alt", "align", "class"}, |
| 43 | + "span": {"class"}, |
| 44 | + "th": {"align", "class"}, |
| 45 | + "td": {"align", "colspan", "rowspan"}, |
| 46 | + "div": {"align", "class"}, |
| 47 | + "h1": {"align"}, |
| 48 | + "h2": {"align"}, |
| 49 | + "h3": {"align"}, |
| 50 | + "h4": {"align"}, |
| 51 | + "h5": {"align"}, |
| 52 | + "h6": {"align"}, |
| 53 | + "code": {"class"}, |
| 54 | + "p": {"align", "class"}, |
| 55 | + "pre": {"lang"}, |
| 56 | + "ol": {"start"}, |
| 57 | + "input": {"type", "checked", "disabled"}, |
| 58 | + "aside": {"class"}, |
| 59 | + "dd": {"class"}, |
| 60 | + "dl": {"class"}, |
| 61 | + "dt": {"class"}, |
| 62 | + "ul": {"class"}, |
| 63 | + "nav": {"class"}, |
| 64 | + "figure": {"class"}, |
69 | 65 | } |
70 | 66 |
|
71 | 67 |
|
72 | | -class DisabledCheckboxInputsFilter: |
73 | | - # The typeshed for bleach (html5lib) filters is incomplete, use `typing.Any` |
74 | | - # See https://github.com/python/typeshed/blob/505ea726415016e53638c8b584b8fdc9c722cac1/stubs/bleach/bleach/html5lib_shim.pyi#L7-L8 # noqa E501 |
75 | | - def __init__(self, source: Any) -> None: |
76 | | - self.source = source |
77 | | - |
78 | | - def __iter__(self) -> Iterator[Dict[str, Optional[str]]]: |
79 | | - for token in self.source: |
80 | | - if token.get("name") == "input": |
81 | | - # only allow disabled checkbox inputs |
82 | | - is_checkbox, is_disabled, unsafe_attrs = False, False, False |
83 | | - for (_, attrname), value in token.get("data", {}).items(): |
84 | | - if attrname == "type" and value == "checkbox": |
85 | | - is_checkbox = True |
86 | | - elif attrname == "disabled": |
87 | | - is_disabled = True |
88 | | - elif attrname != "checked": |
89 | | - unsafe_attrs = True |
90 | | - break |
91 | | - if is_checkbox and is_disabled and not unsafe_attrs: |
92 | | - yield token |
93 | | - else: |
94 | | - yield token |
95 | | - |
96 | | - def __getattr__(self, name: str) -> Any: |
97 | | - return getattr(self.source, name) |
98 | | - |
99 | | - |
def clean(
    html: str,
    tags: Optional[Set[str]] = None,
    attributes: Optional[Dict[str, Set[str]]] = None
) -> Optional[str]:
    """Sanitize an HTML fragment with nh3.

    Args:
        html: The markup to sanitize.
        tags: Element names to keep. Defaults to ``ALLOWED_TAGS`` when
            ``None``.
        attributes: Mapping of element name to the attribute names kept on
            that element. Defaults to ``ALLOWED_ATTRIBUTES`` when ``None``.

    Returns:
        The sanitized markup, or ``None`` if sanitizing raised
        ``ValueError``.
    """
    if tags is None:
        tags = ALLOWED_TAGS
    if attributes is None:
        attributes = ALLOWED_ATTRIBUTES

    # NOTE(review): nothing here restricts <input> to disabled checkboxes;
    # any <input> carrying only allowed attributes passes through. Confirm
    # that this is intended.
    try:
        return nh3.clean(
            html,
            # Forward the resolved allow-lists rather than the module
            # defaults, so caller-supplied overrides actually take effect.
            tags=tags,
            attributes=attributes,
            link_rel="nofollow",
            url_schemes={"http", "https", "mailto"},
        )
    except ValueError:
        # NOTE(review): handler kept as-is to preserve the Optional[str]
        # contract; confirm which errors nh3 actually raises here.
        return None
0 commit comments