diff --git a/html5lib/sanitizer.py b/html5lib/sanitizer.py index 9c7d342d..d9350952 100644 --- a/html5lib/sanitizer.py +++ b/html5lib/sanitizer.py @@ -168,57 +168,63 @@ def sanitize_token(self, token): if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"], tokenTypes["EmptyTag"]): if token["name"] in self.allowed_elements: - if "data" in token: - attrs = dict([(name,val) for name,val in - token["data"][::-1] - if name in self.allowed_attributes]) - for attr in self.attr_val_is_uri: - if attr not in attrs: - continue - val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', - unescape(attrs[attr])).lower() - #remove replacement characters from unescaped characters - val_unescaped = val_unescaped.replace("\ufffd", "") - if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and - (val_unescaped.split(':')[0] not in - self.allowed_protocols)): - del attrs[attr] - for attr in self.svg_attr_val_allows_ref: - if attr in attrs: - attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', - ' ', - unescape(attrs[attr])) - if (token["name"] in self.svg_allow_local_href and - 'xlink:href' in attrs and re.search('^\s*[^#\s].*', - attrs['xlink:href'])): - del attrs['xlink:href'] - if 'style' in attrs: - attrs['style'] = self.sanitize_css(attrs['style']) - token["data"] = [[name,val] for name,val in list(attrs.items())] - return token + return self.allowed_token(token, token_type) else: - if token_type == tokenTypes["EndTag"]: - token["data"] = "</%s>" % token["name"] - elif token["data"]: - attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]]) - token["data"] = "<%s%s>" % (token["name"],attrs) - else: - token["data"] = "<%s>" % token["name"] - if token.get("selfClosing"): - token["data"]=token["data"][:-1] + "/>" - - if token["type"] in list(tokenTypes.keys()): - token["type"] = "Characters" - else: - token["type"] = tokenTypes["Characters"] - - del token["name"] - return token + return self.disallowed_token(token, token_type) elif token_type == 
tokenTypes["Comment"]: pass else: return token + def allowed_token(self, token, token_type): + if "data" in token: + attrs = dict([(name,val) for name,val in + token["data"][::-1] + if name in self.allowed_attributes]) + for attr in self.attr_val_is_uri: + if attr not in attrs: + continue + val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', + unescape(attrs[attr])).lower() + #remove replacement characters from unescaped characters + val_unescaped = val_unescaped.replace("\ufffd", "") + if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and + (val_unescaped.split(':')[0] not in + self.allowed_protocols)): + del attrs[attr] + for attr in self.svg_attr_val_allows_ref: + if attr in attrs: + attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', + ' ', + unescape(attrs[attr])) + if (token["name"] in self.svg_allow_local_href and + 'xlink:href' in attrs and re.search('^\s*[^#\s].*', + attrs['xlink:href'])): + del attrs['xlink:href'] + if 'style' in attrs: + attrs['style'] = self.sanitize_css(attrs['style']) + token["data"] = [[name,val] for name,val in list(attrs.items())] + return token + + def disallowed_token(self, token, token_type): + if token_type == tokenTypes["EndTag"]: + token["data"] = "</%s>" % token["name"] + elif token["data"]: + attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]]) + token["data"] = "<%s%s>" % (token["name"],attrs) + else: + token["data"] = "<%s>" % token["name"] + if token.get("selfClosing"): + token["data"]=token["data"][:-1] + "/>" + + if token["type"] in list(tokenTypes.keys()): + token["type"] = "Characters" + else: + token["type"] = tokenTypes["Characters"] + + del token["name"] + return token + def sanitize_css(self, style): # disallow urls style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)