From 40c3ba67f3a8573f64c3fec9a2aec9d7ab8b8f8d Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Wed, 16 Dec 2015 01:30:55 +0000 Subject: [PATCH 01/16] Fix lint to expect text_type everywhere --- html5lib/filters/lint.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py index 8884696d..9eee9cc5 100644 --- a/html5lib/filters/lint.py +++ b/html5lib/filters/lint.py @@ -1,5 +1,7 @@ from __future__ import absolute_import, division, unicode_literals +from six import text_type + from . import _base from ..constants import cdataElements, rcdataElements, voidElements @@ -21,7 +23,7 @@ def __iter__(self): name = token["name"] if contentModelFlag != "PCDATA": raise LintError("StartTag not in PCDATA content model flag: %(tag)s" % {"tag": name}) - if not isinstance(name, str): + if not isinstance(name, text_type): raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) if not name: raise LintError("Empty tag name") @@ -32,11 +34,11 @@ def __iter__(self): if type == "StartTag": open_elements.append(name) for name, value in token["data"]: - if not isinstance(name, str): + if not isinstance(name, text_type): raise LintError("Attribute name is not a string: %(name)r" % {"name": name}) if not name: raise LintError("Empty attribute name") - if not isinstance(value, str): + if not isinstance(value, text_type): raise LintError("Attribute value is not a string: %(value)r" % {"value": value}) if name in cdataElements: contentModelFlag = "CDATA" @@ -47,7 +49,7 @@ def __iter__(self): elif type == "EndTag": name = token["name"] - if not isinstance(name, str): + if not isinstance(name, text_type): raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) if not name: raise LintError("Empty tag name") @@ -64,7 +66,7 @@ def __iter__(self): elif type in ("Characters", "SpaceCharacters"): data = token["data"] - if not isinstance(data, str): + if not isinstance(data, text_type): raise 
LintError("Attribute name is not a string: %(name)r" % {"name": data}) if not data: raise LintError("%(type)s token with empty data" % {"type": type}) @@ -77,7 +79,7 @@ def __iter__(self): name = token["name"] if contentModelFlag != "PCDATA": raise LintError("Doctype not in PCDATA content model flag: %(name)s" % {"name": name}) - if not isinstance(name, str): + if not isinstance(name, text_type): raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) # XXX: what to do with token["data"] ? From fbbea1f614aaf69943c82271a37ec78623c362f7 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Wed, 16 Dec 2015 01:34:28 +0000 Subject: [PATCH 02/16] Update lint filter for namespaced attributes --- html5lib/filters/lint.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py index 9eee9cc5..74cdc859 100644 --- a/html5lib/filters/lint.py +++ b/html5lib/filters/lint.py @@ -33,11 +33,15 @@ def __iter__(self): raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]}) if type == "StartTag": open_elements.append(name) - for name, value in token["data"]: - if not isinstance(name, text_type): - raise LintError("Attribute name is not a string: %(name)r" % {"name": name}) - if not name: - raise LintError("Empty attribute name") + for (namespace, localname), value in token["data"].items(): + if namespace is not None and not isinstance(namespace, text_type): + raise LintError("Attribute namespace is not a string or None: %(name)r" % {"name": namespace}) + if namespace == "": + raise LintError("Empty attribute namespace") + if not isinstance(localname, text_type): + raise LintError("Attribute localname is not a string: %(name)r" % {"name": localname}) + if not localname: + raise LintError("Empty attribute localname") if not isinstance(value, text_type): raise LintError("Attribute value is not a string: %(value)r" % {"value": value}) if name in 
cdataElements: From 8b4d7c45b3715a3ae22ef543ec5cdfe5c742792e Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Wed, 16 Dec 2015 01:36:22 +0000 Subject: [PATCH 03/16] Drop the content model requirements from lint --- html5lib/filters/lint.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py index 74cdc859..fc7c1ebe 100644 --- a/html5lib/filters/lint.py +++ b/html5lib/filters/lint.py @@ -3,7 +3,7 @@ from six import text_type from . import _base -from ..constants import cdataElements, rcdataElements, voidElements +from ..constants import voidElements from ..constants import spaceCharacters spaceCharacters = "".join(spaceCharacters) @@ -16,13 +16,10 @@ class LintError(Exception): class Filter(_base.Filter): def __iter__(self): open_elements = [] - contentModelFlag = "PCDATA" for token in _base.Filter.__iter__(self): type = token["type"] if type in ("StartTag", "EmptyTag"): name = token["name"] - if contentModelFlag != "PCDATA": - raise LintError("StartTag not in PCDATA content model flag: %(tag)s" % {"tag": name}) if not isinstance(name, text_type): raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) if not name: @@ -44,12 +41,6 @@ def __iter__(self): raise LintError("Empty attribute localname") if not isinstance(value, text_type): raise LintError("Attribute value is not a string: %(value)r" % {"value": value}) - if name in cdataElements: - contentModelFlag = "CDATA" - elif name in rcdataElements: - contentModelFlag = "RCDATA" - elif name == "plaintext": - contentModelFlag = "PLAINTEXT" elif type == "EndTag": name = token["name"] @@ -62,11 +53,9 @@ def __iter__(self): start_name = open_elements.pop() if start_name != name: raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name}) - contentModelFlag = "PCDATA" elif type == "Comment": - if contentModelFlag != "PCDATA": - raise LintError("Comment not in PCDATA 
content model flag") + pass elif type in ("Characters", "SpaceCharacters"): data = token["data"] @@ -81,8 +70,6 @@ def __iter__(self): elif type == "Doctype": name = token["name"] - if contentModelFlag != "PCDATA": - raise LintError("Doctype not in PCDATA content model flag: %(name)s" % {"name": name}) if not isinstance(name, text_type): raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) # XXX: what to do with token["data"] ? From 270a2ca14fafc989f8f1bd4f79db2f4bd9f4d1fc Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Wed, 16 Dec 2015 02:05:55 +0000 Subject: [PATCH 04/16] Don't let the lxml treewalker walk above the fragment root --- html5lib/treewalkers/lxmletree.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/html5lib/treewalkers/lxmletree.py b/html5lib/treewalkers/lxmletree.py index 90e116d3..5c258a86 100644 --- a/html5lib/treewalkers/lxmletree.py +++ b/html5lib/treewalkers/lxmletree.py @@ -118,8 +118,10 @@ def __len__(self): class TreeWalker(_base.NonRecursiveTreeWalker): def __init__(self, tree): if hasattr(tree, "getroot"): + self.fragmentChildren = set() tree = Root(tree) elif isinstance(tree, list): + self.fragmentChildren = set(tree) tree = FragmentRoot(tree) _base.NonRecursiveTreeWalker.__init__(self, tree) self.filter = ihatexml.InfosetFilter() @@ -197,5 +199,7 @@ def getParentNode(self, node): if key == "text": return node # else: fallback to "normal" processing + elif node in self.fragmentChildren: + return None return node.getparent() From 66ef02658ba79d5cffc65d71468da3b3d0b6398e Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Wed, 16 Dec 2015 02:22:22 +0000 Subject: [PATCH 05/16] Teach lint & treewalkers that elements are only void in HTML ns --- html5lib/filters/lint.py | 22 ++++++++++++++++------ html5lib/treewalkers/_base.py | 6 +++--- html5lib/treewalkers/genshistream.py | 2 +- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py index 
fc7c1ebe..cc3e4ac4 100644 --- a/html5lib/filters/lint.py +++ b/html5lib/filters/lint.py @@ -3,7 +3,7 @@ from six import text_type from . import _base -from ..constants import voidElements +from ..constants import namespaces, voidElements from ..constants import spaceCharacters spaceCharacters = "".join(spaceCharacters) @@ -19,17 +19,22 @@ def __iter__(self): for token in _base.Filter.__iter__(self): type = token["type"] if type in ("StartTag", "EmptyTag"): + namespace = token["namespace"] name = token["name"] + if namespace is not None and not isinstance(namespace, text_type): + raise LintError("Tag namespace is not a string or None: %(name)r" % {"name": namespace}) + if namespace == "": + raise LintError("Empty tag namespace") if not isinstance(name, text_type): raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) if not name: raise LintError("Empty tag name") - if type == "StartTag" and name in voidElements: + if type == "StartTag" and (not namespace or namespace == namespaces["html"]) and name in voidElements: raise LintError("Void element reported as StartTag token: %(tag)s" % {"tag": name}) - elif type == "EmptyTag" and name not in voidElements: + elif type == "EmptyTag" and (not namespace or namespace == namespaces["html"]) and name not in voidElements: raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]}) if type == "StartTag": - open_elements.append(name) + open_elements.append((namespace, name)) for (namespace, localname), value in token["data"].items(): if namespace is not None and not isinstance(namespace, text_type): raise LintError("Attribute namespace is not a string or None: %(name)r" % {"name": namespace}) @@ -43,15 +48,20 @@ def __iter__(self): raise LintError("Attribute value is not a string: %(value)r" % {"value": value}) elif type == "EndTag": + namespace = token["namespace"] name = token["name"] + if namespace is not None and not isinstance(namespace, text_type): + raise LintError("Tag 
namespace is not a string or None: %(name)r" % {"name": namespace}) + if namespace == "": + raise LintError("Empty tag namespace") if not isinstance(name, text_type): raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) if not name: raise LintError("Empty tag name") - if name in voidElements: + if (not namespace or namespace == namespaces["html"]) and name in voidElements: raise LintError("Void element reported as EndTag token: %(tag)s" % {"tag": name}) start_name = open_elements.pop() - if start_name != name: + if start_name != (namespace, name): raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name}) elif type == "Comment": diff --git a/html5lib/treewalkers/_base.py b/html5lib/treewalkers/_base.py index e79a4357..271f45a0 100644 --- a/html5lib/treewalkers/_base.py +++ b/html5lib/treewalkers/_base.py @@ -2,7 +2,7 @@ from six import text_type, string_types from xml.dom import Node -from ..constants import voidElements, spaceCharacters +from ..constants import namespaces, voidElements, spaceCharacters __all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN", "TreeWalker", "NonRecursiveTreeWalker"] @@ -154,7 +154,7 @@ def __iter__(self): elif type == ELEMENT: namespace, name, attributes, hasChildren = details - if name in voidElements: + if (not namespace or namespace == namespaces["html"]) and name in voidElements: for token in self.emptyTag(namespace, name, attributes, hasChildren): yield token @@ -187,7 +187,7 @@ def __iter__(self): type, details = details[0], details[1:] if type == ELEMENT: namespace, name, attributes, hasChildren = details - if name not in voidElements: + if (namespace and namespace != namespaces["html"]) or name not in voidElements: yield self.endTag(namespace, name) if self.tree is currentNode: currentNode = None diff --git a/html5lib/treewalkers/genshistream.py b/html5lib/treewalkers/genshistream.py index 24d33282..83cd1654 100644 --- 
a/html5lib/treewalkers/genshistream.py +++ b/html5lib/treewalkers/genshistream.py @@ -48,7 +48,7 @@ def tokens(self, event, next): elif kind == END: name = data.localname namespace = data.namespace - if name not in voidElements: + if namespace != namespaces["html"] or name not in voidElements: yield self.endTag(namespace, name) elif kind == COMMENT: From 5bd341350b22a78295c9b2883b568774d15fadef Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Wed, 16 Dec 2015 02:26:44 +0000 Subject: [PATCH 06/16] Use lint filter to ensure validity of treewalkers --- html5lib/tests/test_treewalkers.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py index c79d0b1b..04a6cae4 100644 --- a/html5lib/tests/test_treewalkers.py +++ b/html5lib/tests/test_treewalkers.py @@ -14,6 +14,7 @@ from .support import get_data_files, TestData, convertExpected from html5lib import html5parser, treewalkers, treebuilders, treeadapters, constants +from html5lib.filters.lint import Filter as Lint treeTypes = { @@ -91,7 +92,7 @@ def test_all_tokens(self): p = html5parser.HTMLParser(tree=treeCls["builder"]) document = p.parse("a
b
c") document = treeCls.get("adapter", lambda x: x)(document) - output = treeCls["walker"](document) + output = Lint(treeCls["walker"](document)) for expectedToken, outputToken in zip(expected, output): self.assertEqual(expectedToken, outputToken) @@ -111,7 +112,7 @@ def runTreewalkerTest(innerHTML, input, expected, errors, treeClass): document = treeClass.get("adapter", lambda x: x)(document) try: - output = treewalkers.pprint(treeClass["walker"](document)) + output = treewalkers.pprint(Lint(treeClass["walker"](document))) output = attrlist.sub(sortattrs, output) expected = attrlist.sub(sortattrs, convertExpected(expected)) diff = "".join(unified_diff([line + "\n" for line in expected.splitlines()], From fb9e1776a565ca157c33e4301891a58dee4337c4 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Wed, 16 Dec 2015 02:27:56 +0000 Subject: [PATCH 07/16] Remove runtime type checks from treewalkers._base --- html5lib/treewalkers/_base.py | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/html5lib/treewalkers/_base.py b/html5lib/treewalkers/_base.py index 271f45a0..dd6823dd 100644 --- a/html5lib/treewalkers/_base.py +++ b/html5lib/treewalkers/_base.py @@ -31,11 +31,6 @@ def to_text(s, blank_if_none=True): return text_type(s) -def is_text_or_none(string): - """Wrapper around isinstance(string_types) or is None""" - return string is None or isinstance(string, string_types) - - class TreeWalker(object): def __init__(self, tree): self.tree = tree @@ -47,13 +42,6 @@ def error(self, msg): return {"type": "SerializeError", "data": msg} def emptyTag(self, namespace, name, attrs, hasChildren=False): - assert namespace is None or isinstance(namespace, string_types), type(namespace) - assert isinstance(name, string_types), type(name) - assert all((namespace is None or isinstance(namespace, string_types)) and - isinstance(name, string_types) and - isinstance(value, string_types) - for (namespace, name), value in attrs.items()) - yield {"type": 
"EmptyTag", "name": to_text(name, False), "namespace": to_text(namespace), "data": attrs} @@ -61,13 +49,6 @@ def emptyTag(self, namespace, name, attrs, hasChildren=False): yield self.error("Void element has children") def startTag(self, namespace, name, attrs): - assert namespace is None or isinstance(namespace, string_types), type(namespace) - assert isinstance(name, string_types), type(name) - assert all((namespace is None or isinstance(namespace, string_types)) and - isinstance(name, string_types) and - isinstance(value, string_types) - for (namespace, name), value in attrs.items()) - return {"type": "StartTag", "name": text_type(name), "namespace": to_text(namespace), @@ -76,17 +57,12 @@ def startTag(self, namespace, name, attrs): for (namespace, name), value in attrs.items())} def endTag(self, namespace, name): - assert namespace is None or isinstance(namespace, string_types), type(namespace) - assert isinstance(name, string_types), type(namespace) - return {"type": "EndTag", "name": to_text(name, False), "namespace": to_text(namespace), "data": {}} def text(self, data): - assert isinstance(data, string_types), type(data) - data = to_text(data) middle = data.lstrip(spaceCharacters) left = data[:len(data) - len(middle)] @@ -101,15 +77,9 @@ def text(self, data): yield {"type": "SpaceCharacters", "data": right} def comment(self, data): - assert isinstance(data, string_types), type(data) - return {"type": "Comment", "data": text_type(data)} def doctype(self, name, publicId=None, systemId=None, correct=True): - assert is_text_or_none(name), type(name) - assert is_text_or_none(publicId), type(publicId) - assert is_text_or_none(systemId), type(systemId) - return {"type": "Doctype", "name": to_text(name), "publicId": to_text(publicId), @@ -117,8 +87,6 @@ def doctype(self, name, publicId=None, systemId=None, correct=True): "correct": to_text(correct)} def entity(self, name): - assert isinstance(name, string_types), type(name) - return {"type": "Entity", "name": 
text_type(name)} def unknown(self, nodeType): From 2a5d7af11230225200cdaf101bb36980a8fd3f8e Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Wed, 16 Dec 2015 02:34:39 +0000 Subject: [PATCH 08/16] Make sure we have the unicode form of text in lxml fragment root --- html5lib/treewalkers/lxmletree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html5lib/treewalkers/lxmletree.py b/html5lib/treewalkers/lxmletree.py index 5c258a86..173fa082 100644 --- a/html5lib/treewalkers/lxmletree.py +++ b/html5lib/treewalkers/lxmletree.py @@ -139,7 +139,7 @@ def getNodeDetails(self, node): return _base.DOCTYPE, node.name, node.public_id, node.system_id elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"): - return _base.TEXT, node.obj + return _base.TEXT, ensure_str(node.obj) elif node.tag == etree.Comment: return _base.COMMENT, ensure_str(node.text) From 9eff304ce8a230ecfe84a4c4fcb61b887bfcc551 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Wed, 16 Dec 2015 02:35:13 +0000 Subject: [PATCH 09/16] Allow None as a doctype tagname in lint --- html5lib/filters/lint.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py index cc3e4ac4..9f99a876 100644 --- a/html5lib/filters/lint.py +++ b/html5lib/filters/lint.py @@ -80,8 +80,8 @@ def __iter__(self): elif type == "Doctype": name = token["name"] - if not isinstance(name, text_type): - raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) + if name is not None and not isinstance(name, text_type): + raise LintError("Tag name is not a string or None: %(tag)r" % {"tag": name}) # XXX: what to do with token["data"] ? 
elif type in ("ParseError", "SerializeError"): From e0ea89948b80a300825b039fcfcda8ec4a13d513 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Wed, 16 Dec 2015 02:39:38 +0000 Subject: [PATCH 10/16] Drop all the to_text magic in treewalkers._base --- html5lib/treewalkers/_base.py | 44 +++++++++++------------------------ 1 file changed, 14 insertions(+), 30 deletions(-) diff --git a/html5lib/treewalkers/_base.py b/html5lib/treewalkers/_base.py index dd6823dd..6d0faef1 100644 --- a/html5lib/treewalkers/_base.py +++ b/html5lib/treewalkers/_base.py @@ -1,5 +1,4 @@ from __future__ import absolute_import, division, unicode_literals -from six import text_type, string_types from xml.dom import Node from ..constants import namespaces, voidElements, spaceCharacters @@ -18,19 +17,6 @@ spaceCharacters = "".join(spaceCharacters) -def to_text(s, blank_if_none=True): - """Wrapper around six.text_type to convert None to empty string""" - if s is None: - if blank_if_none: - return "" - else: - return None - elif isinstance(s, text_type): - return s - else: - return text_type(s) - - class TreeWalker(object): def __init__(self, tree): self.tree = tree @@ -42,28 +28,26 @@ def error(self, msg): return {"type": "SerializeError", "data": msg} def emptyTag(self, namespace, name, attrs, hasChildren=False): - yield {"type": "EmptyTag", "name": to_text(name, False), - "namespace": to_text(namespace), + yield {"type": "EmptyTag", "name": name, + "namespace": namespace, "data": attrs} if hasChildren: yield self.error("Void element has children") def startTag(self, namespace, name, attrs): return {"type": "StartTag", - "name": text_type(name), - "namespace": to_text(namespace), - "data": dict(((to_text(namespace, False), to_text(name)), - to_text(value, False)) - for (namespace, name), value in attrs.items())} + "name": name, + "namespace": namespace, + "data": attrs} def endTag(self, namespace, name): return {"type": "EndTag", - "name": to_text(name, False), - "namespace": 
to_text(namespace), + "name": name, + "namespace": namespace, "data": {}} def text(self, data): - data = to_text(data) + data = data middle = data.lstrip(spaceCharacters) left = data[:len(data) - len(middle)] if left: @@ -77,17 +61,17 @@ def text(self, data): yield {"type": "SpaceCharacters", "data": right} def comment(self, data): - return {"type": "Comment", "data": text_type(data)} + return {"type": "Comment", "data": data} def doctype(self, name, publicId=None, systemId=None, correct=True): return {"type": "Doctype", - "name": to_text(name), - "publicId": to_text(publicId), - "systemId": to_text(systemId), - "correct": to_text(correct)} + "name": name, + "publicId": publicId, + "systemId": systemId, + "correct": correct} def entity(self, name): - return {"type": "Entity", "name": text_type(name)} + return {"type": "Entity", "name": name} def unknown(self, nodeType): return self.error("Unknown node type: " + nodeType) From 22c2b1ac0fc9eb73aefde898f7b9c948e34dc041 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Wed, 16 Dec 2015 02:52:28 +0000 Subject: [PATCH 11/16] Get rid of LintError and just use asserts All of these properties should always hold per the API, so asserts seem like a good match here. 
--- html5lib/filters/lint.py | 77 +++++++++++++++------------------------- 1 file changed, 28 insertions(+), 49 deletions(-) diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py index 9f99a876..e2434ef4 100644 --- a/html5lib/filters/lint.py +++ b/html5lib/filters/lint.py @@ -9,10 +9,6 @@ spaceCharacters = "".join(spaceCharacters) -class LintError(Exception): - pass - - class Filter(_base.Filter): def __iter__(self): open_elements = [] @@ -21,73 +17,56 @@ def __iter__(self): if type in ("StartTag", "EmptyTag"): namespace = token["namespace"] name = token["name"] - if namespace is not None and not isinstance(namespace, text_type): - raise LintError("Tag namespace is not a string or None: %(name)r" % {"name": namespace}) - if namespace == "": - raise LintError("Empty tag namespace") - if not isinstance(name, text_type): - raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) - if not name: - raise LintError("Empty tag name") - if type == "StartTag" and (not namespace or namespace == namespaces["html"]) and name in voidElements: - raise LintError("Void element reported as StartTag token: %(tag)s" % {"tag": name}) - elif type == "EmptyTag" and (not namespace or namespace == namespaces["html"]) and name not in voidElements: - raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]}) + assert namespace is None or isinstance(namespace, text_type) + assert namespace != "" + assert isinstance(name, text_type) + assert name != "" + assert isinstance(token["data"], dict) + if (not namespace or namespace == namespaces["html"]) and name in voidElements: + assert type == "EmptyTag" + else: + assert type == "StartTag" if type == "StartTag": open_elements.append((namespace, name)) - for (namespace, localname), value in token["data"].items(): - if namespace is not None and not isinstance(namespace, text_type): - raise LintError("Attribute namespace is not a string or None: %(name)r" % {"name": namespace}) - if 
namespace == "": - raise LintError("Empty attribute namespace") - if not isinstance(localname, text_type): - raise LintError("Attribute localname is not a string: %(name)r" % {"name": localname}) - if not localname: - raise LintError("Empty attribute localname") - if not isinstance(value, text_type): - raise LintError("Attribute value is not a string: %(value)r" % {"value": value}) + for (namespace, name), value in token["data"].items(): + assert namespace is None or isinstance(namespace, text_type) + assert namespace != "" + assert isinstance(name, text_type) + assert name != "" + assert isinstance(value, text_type) elif type == "EndTag": namespace = token["namespace"] name = token["name"] - if namespace is not None and not isinstance(namespace, text_type): - raise LintError("Tag namespace is not a string or None: %(name)r" % {"name": namespace}) - if namespace == "": - raise LintError("Empty tag namespace") - if not isinstance(name, text_type): - raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) - if not name: - raise LintError("Empty tag name") + assert namespace is None or isinstance(namespace, text_type) + assert namespace != "" + assert isinstance(name, text_type) + assert name != "" if (not namespace or namespace == namespaces["html"]) and name in voidElements: - raise LintError("Void element reported as EndTag token: %(tag)s" % {"tag": name}) - start_name = open_elements.pop() - if start_name != (namespace, name): - raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name}) + assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name} + else: + start = open_elements.pop() + assert start == (namespace, name) elif type == "Comment": pass elif type in ("Characters", "SpaceCharacters"): data = token["data"] - if not isinstance(data, text_type): - raise LintError("Attribute name is not a string: %(name)r" % {"name": data}) - if not data: - raise LintError("%(type)s token with 
empty data" % {"type": type}) + assert isinstance(data, text_type) + assert data != "" if type == "SpaceCharacters": - data = data.strip(spaceCharacters) - if data: - raise LintError("Non-space character(s) found in SpaceCharacters token: %(token)r" % {"token": data}) + assert data.strip(spaceCharacters) == "" elif type == "Doctype": name = token["name"] - if name is not None and not isinstance(name, text_type): - raise LintError("Tag name is not a string or None: %(tag)r" % {"tag": name}) + assert name is None or isinstance(name, text_type) # XXX: what to do with token["data"] ? elif type in ("ParseError", "SerializeError"): pass else: - raise LintError("Unknown token type: %(type)s" % {"type": type}) + assert False, "Unknown token type: %(type)s" % {"type": type} yield token From 5336ebea678f099f5def28ffe3924c41c6de782d Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Wed, 16 Dec 2015 02:54:53 +0000 Subject: [PATCH 12/16] Lint that comments are text_type --- html5lib/filters/lint.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py index e2434ef4..be51b852 100644 --- a/html5lib/filters/lint.py +++ b/html5lib/filters/lint.py @@ -49,7 +49,8 @@ def __iter__(self): assert start == (namespace, name) elif type == "Comment": - pass + data = token["data"] + assert isinstance(data, text_type) elif type in ("Characters", "SpaceCharacters"): data = token["data"] From dc879ffaab0455e8974ceaac40b727e5a04c1175 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Wed, 16 Dec 2015 02:55:06 +0000 Subject: [PATCH 13/16] Don't allow ParseError/SerializerError tokens, whatever they are! 
--- html5lib/filters/lint.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py index be51b852..076dbc54 100644 --- a/html5lib/filters/lint.py +++ b/html5lib/filters/lint.py @@ -64,9 +64,6 @@ def __iter__(self): assert name is None or isinstance(name, text_type) # XXX: what to do with token["data"] ? - elif type in ("ParseError", "SerializeError"): - pass - else: assert False, "Unknown token type: %(type)s" % {"type": type} From 7f8bd13cc2d6e334d898c64afecf4b1bf64c5f93 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Wed, 16 Dec 2015 02:55:32 +0000 Subject: [PATCH 14/16] Drop end tag tree walker's data (always empty now) --- html5lib/tests/test_treewalkers.py | 8 ++++---- html5lib/treewalkers/_base.py | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py index 04a6cae4..e59f25ea 100644 --- a/html5lib/tests/test_treewalkers.py +++ b/html5lib/tests/test_treewalkers.py @@ -78,15 +78,15 @@ def test_all_tokens(self): expected = [ {'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'}, {'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'}, - {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'}, + {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'}, {'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'}, {'data': 'a', 'type': 'Characters'}, {'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'}, {'data': 'b', 'type': 'Characters'}, - {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'}, + {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'}, {'data': 'c', 'type': 'Characters'}, - {'data': {}, 'type': 'EndTag', 'namespace': 
'http://www.w3.org/1999/xhtml', 'name': 'body'}, - {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'} + {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'}, + {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'} ] for treeName, treeCls in sorted(treeTypes.items()): p = html5parser.HTMLParser(tree=treeCls["builder"]) diff --git a/html5lib/treewalkers/_base.py b/html5lib/treewalkers/_base.py index 6d0faef1..bf66ec71 100644 --- a/html5lib/treewalkers/_base.py +++ b/html5lib/treewalkers/_base.py @@ -43,8 +43,7 @@ def startTag(self, namespace, name, attrs): def endTag(self, namespace, name): return {"type": "EndTag", "name": name, - "namespace": namespace, - "data": {}} + "namespace": namespace} def text(self, data): data = data From c335295f6b9d0b0710b86d94f79494cc676deb70 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Wed, 16 Dec 2015 02:57:59 +0000 Subject: [PATCH 15/16] Drop tree walker doctype correct flag, whatever that once was! --- html5lib/treewalkers/_base.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/html5lib/treewalkers/_base.py b/html5lib/treewalkers/_base.py index bf66ec71..36e1ba24 100644 --- a/html5lib/treewalkers/_base.py +++ b/html5lib/treewalkers/_base.py @@ -62,12 +62,11 @@ def text(self, data): def comment(self, data): return {"type": "Comment", "data": data} - def doctype(self, name, publicId=None, systemId=None, correct=True): + def doctype(self, name, publicId=None, systemId=None): return {"type": "Doctype", "name": name, "publicId": publicId, - "systemId": systemId, - "correct": correct} + "systemId": systemId} def entity(self, name): return {"type": "Entity", "name": name} From ca6591cca342065305949189f5adbc741f76fe9b Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Wed, 16 Dec 2015 03:55:12 +0000 Subject: [PATCH 16/16] Make sure lint is testing everything treewalkers can do. 
--- html5lib/filters/lint.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py index 076dbc54..3ec63d72 100644 --- a/html5lib/filters/lint.py +++ b/html5lib/filters/lint.py @@ -62,7 +62,14 @@ def __iter__(self): elif type == "Doctype": name = token["name"] assert name is None or isinstance(name, text_type) - # XXX: what to do with token["data"] ? + assert token["publicId"] is None or isinstance(token["publicId"], text_type) + assert token["systemId"] is None or isinstance(token["systemId"], text_type) + + elif type == "Entity": + assert isinstance(token["name"], text_type) + + elif type == "SerializeError": + assert isinstance(token["data"], text_type) else: assert False, "Unknown token type: %(type)s" % {"type": type}