diff --git a/CHANGES.rst b/CHANGES.rst index 62a6a233..1b557816 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -33,6 +33,13 @@ Released on XXX * **Use scripting disabled by default (as we don't implement scripting).** +* **Fix #11, avoiding the XSS bug potentially caused by serializer + allowing attribute values to be escaped out of in old browser versions, + changing the quote_attr_values option on serializer to take one of + three values, "always" (the old True value), "legacy" (the new option, + and the new default), and "spec" (the old False value, and the old + default).** + 0.9999999/1.0b8 ~~~~~~~~~~~~~~~ diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py index 3ec63d72..af231d8e 100644 --- a/html5lib/filters/lint.py +++ b/html5lib/filters/lint.py @@ -10,6 +10,10 @@ class Filter(_base.Filter): + def __init__(self, source, require_matching_tags=True): + super(Filter, self).__init__(source) + self.require_matching_tags = require_matching_tags + def __iter__(self): open_elements = [] for token in _base.Filter.__iter__(self): @@ -26,7 +30,7 @@ def __iter__(self): assert type == "EmptyTag" else: assert type == "StartTag" - if type == "StartTag": + if type == "StartTag" and self.require_matching_tags: open_elements.append((namespace, name)) for (namespace, name), value in token["data"].items(): assert namespace is None or isinstance(namespace, text_type) @@ -44,7 +48,7 @@ def __iter__(self): assert name != "" if (not namespace or namespace == namespaces["html"]) and name in voidElements: assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name} - else: + elif self.require_matching_tags: start = open_elements.pop() assert start == (namespace, name) diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py index b87d9a75..afe2e0e2 100644 --- a/html5lib/serializer/htmlserializer.py +++ b/html5lib/serializer/htmlserializer.py @@ -1,10 +1,7 @@ from __future__ import absolute_import, division, unicode_literals from six import text_type -try: - from functools import reduce -except ImportError: - pass +import re from ..constants import voidElements, booleanAttributes, spaceCharacters from ..constants import rcdataElements, entities, xmlEntities @@ -13,6 +10,17 @@ spaceCharacters = "".join(spaceCharacters) +quoteAttributeSpecChars = spaceCharacters + "\"'=<>`" +quoteAttributeSpec = re.compile("[" + quoteAttributeSpecChars + "]") +quoteAttributeLegacy = re.compile("[" + quoteAttributeSpecChars + + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n" + "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15" + "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" + "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000" + "\u2001\u2002\u2003\u2004\u2005\u2006\u2007" + "\u2008\u2009\u200a\u2028\u2029\u202f\u205f" + "\u3000]") + try: from codecs import register_error, xmlcharrefreplace_errors except ImportError: @@ -73,7 +81,7 @@ def htmlentityreplace_errors(exc): class HTMLSerializer(object): # attribute quoting options - quote_attr_values = False + quote_attr_values = "legacy" # be secure by default quote_char = '"' use_best_quote_char = True @@ -109,9 +117,9 @@ def __init__(self, **kwargs): inject_meta_charset=True|False Whether it insert a meta element to define the character set of the document. - quote_attr_values=True|False + quote_attr_values="legacy"|"spec"|"always" Whether to quote attribute values that don't require quoting - per HTML5 parsing rules. + per legacy browser behaviour, when required by the standard, or always. quote_char=u'"'|u"'" Use given quote character for attribute quoting. Default is to use double quote unless attribute value contains a double quote, @@ -240,11 +248,15 @@ def serialize(self, treewalker, encoding=None): (k not in booleanAttributes.get(name, tuple()) and k not in booleanAttributes.get("", tuple())): yield self.encodeStrict("=") - if self.quote_attr_values or not v: + if self.quote_attr_values == "always" or len(v) == 0: quote_attr = True + elif self.quote_attr_values == "spec": + quote_attr = quoteAttributeSpec.search(v) is not None + elif self.quote_attr_values == "legacy": + quote_attr = quoteAttributeLegacy.search(v) is not None else: - quote_attr = reduce(lambda x, y: x or (y in v), - spaceCharacters + ">\"'=", False) + raise ValueError("quote_attr_values must be one of: " + "'always', 'spec', or 'legacy'") v = v.replace("&", "&") if self.escape_lt_in_attrs: v = v.replace("<", "<") diff --git a/html5lib/tests/serializer-testdata/core.test b/html5lib/tests/serializer-testdata/core.test new file mode 100644 index 00000000..70828d0d --- /dev/null +++ b/html5lib/tests/serializer-testdata/core.test @@ -0,0 +1,395 @@ +{ + "tests": [ + { + "expected": [ + "" + ], + "input": [ + [ + "StartTag", + "http://www.w3.org/1999/xhtml", + "span", + [ + { + "namespace": null, + "name": "title", + "value": "test \"with\" "" + } + ] + ] + ], + "description": "proper attribute value escaping" + }, + { + "expected": [ + "" + ], + "input": [ + [ + "StartTag", + "http://www.w3.org/1999/xhtml", + "span", + [ + { + "namespace": null, + "name": "title", + "value": "foo" + } + ] + ] + ], + "description": "proper attribute value non-quoting" + }, + { + "expected": [ + "" + ], + "input": [ + [ + "StartTag", + "http://www.w3.org/1999/xhtml", + "span", + [ + { + "namespace": null, + "name": "title", + "value": "foo" + ], + "input": [ + [ + "StartTag", + "http://www.w3.org/1999/xhtml", + "span", + [ + { + "namespace": null, + "name": "title", + "value": "foo=bar" + } + ] + ] + ], + "description": "proper attribute value quoting (with =)" + }, + { + "expected": [ + "bar\">" + ], + "input": [ + [ + "StartTag", + "http://www.w3.org/1999/xhtml", + "span", + [ + { + "namespace": null, + "name": "title", + "value": "foo>bar" + } + ] + ] + ], + "description": "proper attribute value quoting (with >)" + }, + { + "expected": [ + "" + ], + "input": [ + [ + "StartTag", + "http://www.w3.org/1999/xhtml", + "span", + [ + { + "namespace": null, + "name": "title", + "value": "foo\"bar" + } + ] + ] + ], + "description": "proper attribute value quoting (with \")" + }, + { + "expected": [ + "" + ], + "input": [ + [ + "StartTag", + "http://www.w3.org/1999/xhtml", + "span", + [ + { + "namespace": null, + "name": "title", + "value": "foo'bar" + } + ] + ] + ], + "description": "proper attribute value quoting (with ')" + }, + { + "expected": [ + "" + ], + "input": [ + [ + "StartTag", + "http://www.w3.org/1999/xhtml", + "span", + [ + { + "namespace": null, + "name": "title", + "value": "foo'bar\"baz" + } + ] + ] + ], + "description": "proper attribute value quoting (with both \" and ')" + }, + { + "expected": [ + "" + ], + "input": [ + [ + "StartTag", + "http://www.w3.org/1999/xhtml", + "span", + [ + { + "namespace": null, + "name": "title", + "value": "foo bar" + } + ] + ] + ], + "description": "proper attribute value quoting (with space)" + }, + { + "expected": [ + "" + ], + "input": [ + [ + "StartTag", + "http://www.w3.org/1999/xhtml", + "span", + [ + { + "namespace": null, + "name": "title", + "value": "foo\tbar" + } + ] + ] + ], + "description": "proper attribute value quoting (with tab)" + }, + { + "expected": [ + "" + ], + "input": [ + [ + "StartTag", + "http://www.w3.org/1999/xhtml", + "span", + [ + { + "namespace": null, + "name": "title", + "value": "foo\nbar" + } + ] + ] + ], + "description": "proper attribute value quoting (with LF)" + }, + { + "expected": [ + "" + ], + "input": [ + [ + "StartTag", + "http://www.w3.org/1999/xhtml", + "span", + [ + { + "namespace": null, + "name": "title", + "value": "foo\rbar" + } + ] + ] + ], + "description": "proper attribute value quoting (with CR)" + }, + { + "expected": [ + "" + ], + "input": [ + [ + "StartTag", + "http://www.w3.org/1999/xhtml", + "span", + [ + { + "namespace": null, + "name": "title", + "value": "foo\u000bbar" + } + ] + ] + ], + "description": "proper attribute value non-quoting (with linetab)" + }, + { + "expected": [ + "" + ], + "input": [ + [ + "StartTag", + "http://www.w3.org/1999/xhtml", + "span", + [ + { + "namespace": null, + "name": "title", + "value": "foo\fbar" + } + ] + ] + ], + "description": "proper attribute value quoting (with form feed)" + }, + { + "expected": [ + "" + ], + "input": [ + [ + "EmptyTag", + "img", + {} + ] + ], + "description": "void element (as EmptyTag token)" + }, + { + "expected": [ + "" + ], + "input": [ + [ + "Doctype", + "foo" + ] + ], + "description": "doctype in error" + }, + { + "expected": [ + "a<b>c&d" + ], + "input": [ + [ + "Characters", + "ac&d" + ] + ], + "description": "character data", + "options": { + "encoding": "utf-8" + } + }, + { + "expected": [ + "" + ], + "input": [ + [ + "StartTag", + "http://www.w3.org/1999/xhtml", + "script", + {} + ], + [ + "Characters", + "\t\r\n\f foo \t\r\n\f bar \t\r\n\f" + ], + [ + "EndTag", + "http://www.w3.org/1999/xhtml", + "script" + ] + ], + "description": "text within