From 496e69f536c229da1e1250201abfdc41a2587eca Mon Sep 17 00:00:00 2001
From: Andreas Madsack
Date: Wed, 24 Apr 2013 14:35:26 +0200
Subject: [PATCH 1/3] refactor allowed_token and unallowed_token as new
 methods in HTMLSanitizerMixin for usage in subclass.

---
 html5lib/sanitizer.py | 96 +++++++++++++++++++++++--------------------
 1 file changed, 51 insertions(+), 45 deletions(-)

diff --git a/html5lib/sanitizer.py b/html5lib/sanitizer.py
index 9c7d342d..7d2dc482 100644
--- a/html5lib/sanitizer.py
+++ b/html5lib/sanitizer.py
@@ -168,57 +168,63 @@ def sanitize_token(self, token):
         if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
                           tokenTypes["EmptyTag"]):
             if token["name"] in self.allowed_elements:
-                if "data" in token:
-                    attrs = dict([(name,val) for name,val in
-                                  token["data"][::-1]
-                                  if name in self.allowed_attributes])
-                    for attr in self.attr_val_is_uri:
-                        if attr not in attrs:
-                            continue
-                        val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
-                                               unescape(attrs[attr])).lower()
-                        #remove replacement characters from unescaped characters
-                        val_unescaped = val_unescaped.replace("\ufffd", "")
-                        if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
-                            (val_unescaped.split(':')[0] not in
-                             self.allowed_protocols)):
-                            del attrs[attr]
-                    for attr in self.svg_attr_val_allows_ref:
-                        if attr in attrs:
-                            attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
-                                                 ' ',
-                                                 unescape(attrs[attr]))
-                    if (token["name"] in self.svg_allow_local_href and
-                        'xlink:href' in attrs and re.search('^\s*[^#\s].*',
-                                                            attrs['xlink:href'])):
-                        del attrs['xlink:href']
-                    if 'style' in attrs:
-                        attrs['style'] = self.sanitize_css(attrs['style'])
-                    token["data"] = [[name,val] for name,val in list(attrs.items())]
-                return token
+                return self.allowed_token(token)
             else:
-                if token_type == tokenTypes["EndTag"]:
-                    token["data"] = "</%s>" % token["name"]
-                elif token["data"]:
-                    attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
-                    token["data"] = "<%s%s>" % (token["name"],attrs)
-                else:
-                    token["data"] = "<%s>" % token["name"]
-                if token.get("selfClosing"):
-                    token["data"]=token["data"][:-1] + "/>"
-
-                if token["type"] in list(tokenTypes.keys()):
-                    token["type"] = "Characters"
-                else:
-                    token["type"] = tokenTypes["Characters"]
-
-                del token["name"]
-                return token
+                return self.unallowed_token(token)
         elif token_type == tokenTypes["Comment"]:
             pass
         else:
             return token

+    def allowed_token(self, token):
+        if "data" in token:
+            attrs = dict([(name,val) for name,val in
+                          token["data"][::-1]
+                          if name in self.allowed_attributes])
+            for attr in self.attr_val_is_uri:
+                if attr not in attrs:
+                    continue
+                val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
+                                       unescape(attrs[attr])).lower()
+                #remove replacement characters from unescaped characters
+                val_unescaped = val_unescaped.replace("\ufffd", "")
+                if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
+                    (val_unescaped.split(':')[0] not in
+                     self.allowed_protocols)):
+                    del attrs[attr]
+            for attr in self.svg_attr_val_allows_ref:
+                if attr in attrs:
+                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
+                                         ' ',
+                                         unescape(attrs[attr]))
+            if (token["name"] in self.svg_allow_local_href and
+                'xlink:href' in attrs and re.search('^\s*[^#\s].*',
+                                                    attrs['xlink:href'])):
+                del attrs['xlink:href']
+            if 'style' in attrs:
+                attrs['style'] = self.sanitize_css(attrs['style'])
+            token["data"] = [[name,val] for name,val in list(attrs.items())]
+        return token
+
+    def unallowed_token(self, token):
+        if token_type == tokenTypes["EndTag"]:
+            token["data"] = "</%s>" % token["name"]
+        elif token["data"]:
+            attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
+            token["data"] = "<%s%s>" % (token["name"],attrs)
+        else:
+            token["data"] = "<%s>" % token["name"]
+        if token.get("selfClosing"):
+            token["data"]=token["data"][:-1] + "/>"
+
+        if token["type"] in list(tokenTypes.keys()):
+            token["type"] = "Characters"
+        else:
+            token["type"] = tokenTypes["Characters"]
+
+        del token["name"]
+        return token
+
     def sanitize_css(self, style):
         # disallow urls
         style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)

From 6681550d9b0721739921c3b25e8dd4bdcb9ed99e Mon Sep 17 00:00:00 2001
From: Andreas Madsack
Date: Wed, 24 Apr 2013 15:11:31 +0200
Subject: [PATCH 2/3] added missing parameter for token_type

---
 html5lib/sanitizer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/html5lib/sanitizer.py b/html5lib/sanitizer.py
index 7d2dc482..8b5a1d6f 100644
--- a/html5lib/sanitizer.py
+++ b/html5lib/sanitizer.py
@@ -168,15 +168,15 @@ def sanitize_token(self, token):
         if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
                           tokenTypes["EmptyTag"]):
             if token["name"] in self.allowed_elements:
-                return self.allowed_token(token)
+                return self.allowed_token(token, token_type)
             else:
-                return self.unallowed_token(token)
+                return self.unallowed_token(token, token_type)
         elif token_type == tokenTypes["Comment"]:
             pass
         else:
             return token

-    def allowed_token(self, token):
+    def allowed_token(self, token, token_type):
         if "data" in token:
             attrs = dict([(name,val) for name,val in
                           token["data"][::-1]
@@ -206,7 +206,7 @@ def allowed_token(self, token):
             token["data"] = [[name,val] for name,val in list(attrs.items())]
         return token

-    def unallowed_token(self, token):
+    def unallowed_token(self, token, token_type):
         if token_type == tokenTypes["EndTag"]:
             token["data"] = "</%s>" % token["name"]
         elif token["data"]:

From 9194874b09fff7b58d8408dbe19b79ca73c03459 Mon Sep 17 00:00:00 2001
From: Andreas Madsack
Date: Wed, 24 Apr 2013 15:12:04 +0200
Subject: [PATCH 3/3] changed unallow to disallow

---
 html5lib/sanitizer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/html5lib/sanitizer.py b/html5lib/sanitizer.py
index 8b5a1d6f..d9350952 100644
--- a/html5lib/sanitizer.py
+++ b/html5lib/sanitizer.py
@@ -170,7 +170,7 @@ def sanitize_token(self, token):
             if token["name"] in self.allowed_elements:
                 return self.allowed_token(token, token_type)
             else:
-                return self.unallowed_token(token, token_type)
+                return self.disallowed_token(token, token_type)
         elif token_type == tokenTypes["Comment"]:
             pass
         else:
             return token
@@ -206,7 +206,7 @@ def allowed_token(self, token, token_type):
             token["data"] = [[name,val] for name,val in list(attrs.items())]
         return token

-    def unallowed_token(self, token, token_type):
+    def disallowed_token(self, token, token_type):
         if token_type == tokenTypes["EndTag"]:
             token["data"] = "</%s>" % token["name"]
         elif token["data"]: