diff --git a/html5lib/sanitizer.py b/html5lib/sanitizer.py
index 9c7d342d..d9350952 100644
--- a/html5lib/sanitizer.py
+++ b/html5lib/sanitizer.py
@@ -168,57 +168,63 @@ def sanitize_token(self, token):
if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
tokenTypes["EmptyTag"]):
if token["name"] in self.allowed_elements:
- if "data" in token:
- attrs = dict([(name,val) for name,val in
- token["data"][::-1]
- if name in self.allowed_attributes])
- for attr in self.attr_val_is_uri:
- if attr not in attrs:
- continue
- val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
- unescape(attrs[attr])).lower()
- #remove replacement characters from unescaped characters
- val_unescaped = val_unescaped.replace("\ufffd", "")
- if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
- (val_unescaped.split(':')[0] not in
- self.allowed_protocols)):
- del attrs[attr]
- for attr in self.svg_attr_val_allows_ref:
- if attr in attrs:
- attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
- ' ',
- unescape(attrs[attr]))
- if (token["name"] in self.svg_allow_local_href and
- 'xlink:href' in attrs and re.search('^\s*[^#\s].*',
- attrs['xlink:href'])):
- del attrs['xlink:href']
- if 'style' in attrs:
- attrs['style'] = self.sanitize_css(attrs['style'])
- token["data"] = [[name,val] for name,val in list(attrs.items())]
- return token
+ return self.allowed_token(token, token_type)
else:
- if token_type == tokenTypes["EndTag"]:
- token["data"] = "%s>" % token["name"]
- elif token["data"]:
- attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
- token["data"] = "<%s%s>" % (token["name"],attrs)
- else:
- token["data"] = "<%s>" % token["name"]
- if token.get("selfClosing"):
- token["data"]=token["data"][:-1] + "/>"
-
- if token["type"] in list(tokenTypes.keys()):
- token["type"] = "Characters"
- else:
- token["type"] = tokenTypes["Characters"]
-
- del token["name"]
- return token
+ return self.disallowed_token(token, token_type)
elif token_type == tokenTypes["Comment"]:
pass
else:
return token
+    def allowed_token(self, token, token_type):
+        """Sanitize the attributes of a tag whose element name is allowed.
+
+        The following filters are applied to ``token["data"]`` when present:
+
+        * attributes whose name is not in ``self.allowed_attributes`` are
+          dropped;
+        * attributes listed in ``self.attr_val_is_uri`` are removed when
+          their value starts with a scheme that is not in
+          ``self.allowed_protocols``;
+        * in attributes listed in ``self.svg_attr_val_allows_ref``, any
+          ``url(...)`` whose target does not start with ``#`` is replaced
+          by a single space;
+        * ``xlink:href`` is removed from elements listed in
+          ``self.svg_allow_local_href`` unless its value starts with ``#``;
+        * ``style`` is passed through ``self.sanitize_css``.
+
+        ``token`` is modified in place and returned.  ``token_type`` is
+        accepted but not used by this method.
+        """
+        if "data" in token:
+            # Walk the attributes in reverse so that, when a name is
+            # repeated, the first occurrence is the one kept in the dict.
+            attrs = dict((name, val) for name, val in token["data"][::-1]
+                         if name in self.allowed_attributes)
+            for attr in self.attr_val_is_uri:
+                if attr not in attrs:
+                    continue
+                # Strip backticks, control characters and whitespace before
+                # looking for a scheme, so that it cannot be hidden by
+                # padding such as "java\tscript:".
+                val_unescaped = re.sub(r"[`\000-\040\177-\240\s]+", '',
+                                       unescape(attrs[attr])).lower()
+                # remove replacement characters from unescaped characters
+                val_unescaped = val_unescaped.replace("\ufffd", "")
+                if (re.match(r"^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
+                        (val_unescaped.split(':')[0] not in
+                         self.allowed_protocols)):
+                    del attrs[attr]
+            for attr in self.svg_attr_val_allows_ref:
+                if attr in attrs:
+                    # Only "#fragment" targets survive inside url(...).
+                    attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
+                                         ' ',
+                                         unescape(attrs[attr]))
+            if (token["name"] in self.svg_allow_local_href and
+                    'xlink:href' in attrs and
+                    re.search(r'^\s*[^#\s].*', attrs['xlink:href'])):
+                del attrs['xlink:href']
+            if 'style' in attrs:
+                attrs['style'] = self.sanitize_css(attrs['style'])
+            token["data"] = [[name, val] for name, val in attrs.items()]
+        return token
+
+    def disallowed_token(self, token, token_type):
+        """Turn a tag whose element name is not allowed into plain text.
+
+        The tag is re-serialized as a string into ``token["data"]``, with
+        attribute values passed through ``escape``.  The token's type is
+        switched to Characters and its ``name`` key is removed.
+
+        ``token`` is modified in place and returned.
+        """
+        if token_type == tokenTypes["EndTag"]:
+            # An end tag must keep its leading "</", otherwise "</foo>"
+            # would be emitted as "foo>".
+            token["data"] = "</%s>" % token["name"]
+        elif token["data"]:
+            attrs = ''.join(' %s="%s"' % (k, escape(v))
+                            for k, v in token["data"])
+            token["data"] = "<%s%s>" % (token["name"], attrs)
+        else:
+            token["data"] = "<%s>" % token["name"]
+        if token.get("selfClosing"):
+            # Swap the trailing ">" for "/>".
+            token["data"] = token["data"][:-1] + "/>"
+
+        # token["type"] may be stored either as a key of tokenTypes or as
+        # the value that key maps to; keep whichever form is in use.
+        if token["type"] in tokenTypes:
+            token["type"] = "Characters"
+        else:
+            token["type"] = tokenTypes["Characters"]
+
+        del token["name"]
+        return token
+
def sanitize_css(self, style):
# disallow urls
style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)