diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py index 8884696d..3ec63d72 100644 --- a/html5lib/filters/lint.py +++ b/html5lib/filters/lint.py @@ -1,90 +1,77 @@ from __future__ import absolute_import, division, unicode_literals +from six import text_type + from . import _base -from ..constants import cdataElements, rcdataElements, voidElements +from ..constants import namespaces, voidElements from ..constants import spaceCharacters spaceCharacters = "".join(spaceCharacters) -class LintError(Exception): - pass - - class Filter(_base.Filter): def __iter__(self): open_elements = [] - contentModelFlag = "PCDATA" for token in _base.Filter.__iter__(self): type = token["type"] if type in ("StartTag", "EmptyTag"): + namespace = token["namespace"] name = token["name"] - if contentModelFlag != "PCDATA": - raise LintError("StartTag not in PCDATA content model flag: %(tag)s" % {"tag": name}) - if not isinstance(name, str): - raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) - if not name: - raise LintError("Empty tag name") - if type == "StartTag" and name in voidElements: - raise LintError("Void element reported as StartTag token: %(tag)s" % {"tag": name}) - elif type == "EmptyTag" and name not in voidElements: - raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]}) + assert namespace is None or isinstance(namespace, text_type) + assert namespace != "" + assert isinstance(name, text_type) + assert name != "" + assert isinstance(token["data"], dict) + if (not namespace or namespace == namespaces["html"]) and name in voidElements: + assert type == "EmptyTag" + else: + assert type == "StartTag" if type == "StartTag": - open_elements.append(name) - for name, value in token["data"]: - if not isinstance(name, str): - raise LintError("Attribute name is not a string: %(name)r" % {"name": name}) - if not name: - raise LintError("Empty attribute name") - if not isinstance(value, str): - raise LintError("Attribute value is not a string: %(value)r" % {"value": value}) - if name in cdataElements: - contentModelFlag = "CDATA" - elif name in rcdataElements: - contentModelFlag = "RCDATA" - elif name == "plaintext": - contentModelFlag = "PLAINTEXT" + open_elements.append((namespace, name)) + for (namespace, name), value in token["data"].items(): + assert namespace is None or isinstance(namespace, text_type) + assert namespace != "" + assert isinstance(name, text_type) + assert name != "" + assert isinstance(value, text_type) elif type == "EndTag": + namespace = token["namespace"] name = token["name"] - if not isinstance(name, str): - raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) - if not name: - raise LintError("Empty tag name") - if name in voidElements: - raise LintError("Void element reported as EndTag token: %(tag)s" % {"tag": name}) - start_name = open_elements.pop() - if start_name != name: - raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name}) - contentModelFlag = "PCDATA" + assert namespace is None or isinstance(namespace, text_type) + assert namespace != "" + assert isinstance(name, text_type) + assert name != "" + if (not namespace or namespace == namespaces["html"]) and name in voidElements: + assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name} + else: + start = open_elements.pop() + assert start == (namespace, name) elif type == "Comment": - if contentModelFlag != "PCDATA": - raise LintError("Comment not in PCDATA content model flag") + data = token["data"] + assert isinstance(data, text_type) elif type in ("Characters", "SpaceCharacters"): data = token["data"] - if not isinstance(data, str): - raise LintError("Attribute name is not a string: %(name)r" % {"name": data}) - if not data: - raise LintError("%(type)s token with empty data" % {"type": type}) + assert isinstance(data, text_type) + assert data != "" if type == "SpaceCharacters": - data = data.strip(spaceCharacters) - if data: - raise LintError("Non-space character(s) found in SpaceCharacters token: %(token)r" % {"token": data}) + assert data.strip(spaceCharacters) == "" elif type == "Doctype": name = token["name"] - if contentModelFlag != "PCDATA": - raise LintError("Doctype not in PCDATA content model flag: %(name)s" % {"name": name}) - if not isinstance(name, str): - raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) - # XXX: what to do with token["data"] ? + assert name is None or isinstance(name, text_type) + assert token["publicId"] is None or isinstance(name, text_type) + assert token["systemId"] is None or isinstance(name, text_type) + + elif type == "Entity": + assert isinstance(token["name"], text_type) - elif type in ("ParseError", "SerializeError"): - pass + elif type == "SerializerError": + assert isinstance(token["data"], text_type) else: - raise LintError("Unknown token type: %(type)s" % {"type": type}) + assert False, "Unknown token type: %(type)s" % {"type": type} yield token diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py index c79d0b1b..e59f25ea 100644 --- a/html5lib/tests/test_treewalkers.py +++ b/html5lib/tests/test_treewalkers.py @@ -14,6 +14,7 @@ from .support import get_data_files, TestData, convertExpected from html5lib import html5parser, treewalkers, treebuilders, treeadapters, constants +from html5lib.filters.lint import Filter as Lint treeTypes = { @@ -77,21 +78,21 @@ def test_all_tokens(self): expected = [ {'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'}, {'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'}, - {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'}, + {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'}, {'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'}, {'data': 'a', 'type': 'Characters'}, {'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'}, {'data': 'b', 'type': 'Characters'}, - {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'}, + {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'}, {'data': 'c', 'type': 'Characters'}, - {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'}, - {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'} + {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'}, + {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'} ] for treeName, treeCls in sorted(treeTypes.items()): p = html5parser.HTMLParser(tree=treeCls["builder"]) document = p.parse("a
b
c") document = treeCls.get("adapter", lambda x: x)(document) - output = treeCls["walker"](document) + output = Lint(treeCls["walker"](document)) for expectedToken, outputToken in zip(expected, output): self.assertEqual(expectedToken, outputToken) @@ -111,7 +112,7 @@ def runTreewalkerTest(innerHTML, input, expected, errors, treeClass): document = treeClass.get("adapter", lambda x: x)(document) try: - output = treewalkers.pprint(treeClass["walker"](document)) + output = treewalkers.pprint(Lint(treeClass["walker"](document))) output = attrlist.sub(sortattrs, output) expected = attrlist.sub(sortattrs, convertExpected(expected)) diff = "".join(unified_diff([line + "\n" for line in expected.splitlines()], diff --git a/html5lib/treewalkers/_base.py b/html5lib/treewalkers/_base.py index e79a4357..36e1ba24 100644 --- a/html5lib/treewalkers/_base.py +++ b/html5lib/treewalkers/_base.py @@ -1,8 +1,7 @@ from __future__ import absolute_import, division, unicode_literals -from six import text_type, string_types from xml.dom import Node -from ..constants import voidElements, spaceCharacters +from ..constants import namespaces, voidElements, spaceCharacters __all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN", "TreeWalker", "NonRecursiveTreeWalker"] @@ -18,24 +17,6 @@ spaceCharacters = "".join(spaceCharacters) -def to_text(s, blank_if_none=True): - """Wrapper around six.text_type to convert None to empty string""" - if s is None: - if blank_if_none: - return "" - else: - return None - elif isinstance(s, text_type): - return s - else: - return text_type(s) - - -def is_text_or_none(string): - """Wrapper around isinstance(string_types) or is None""" - return string is None or isinstance(string, string_types) - - class TreeWalker(object): def __init__(self, tree): self.tree = tree @@ -47,47 +28,25 @@ def error(self, msg): return {"type": "SerializeError", "data": msg} def emptyTag(self, namespace, name, attrs, hasChildren=False): - assert namespace is None or isinstance(namespace, string_types), type(namespace) - assert isinstance(name, string_types), type(name) - assert all((namespace is None or isinstance(namespace, string_types)) and - isinstance(name, string_types) and - isinstance(value, string_types) - for (namespace, name), value in attrs.items()) - - yield {"type": "EmptyTag", "name": to_text(name, False), - "namespace": to_text(namespace), + yield {"type": "EmptyTag", "name": name, + "namespace": namespace, "data": attrs} if hasChildren: yield self.error("Void element has children") def startTag(self, namespace, name, attrs): - assert namespace is None or isinstance(namespace, string_types), type(namespace) - assert isinstance(name, string_types), type(name) - assert all((namespace is None or isinstance(namespace, string_types)) and - isinstance(name, string_types) and - isinstance(value, string_types) - for (namespace, name), value in attrs.items()) - return {"type": "StartTag", - "name": text_type(name), - "namespace": to_text(namespace), - "data": dict(((to_text(namespace, False), to_text(name)), - to_text(value, False)) - for (namespace, name), value in attrs.items())} + "name": name, + "namespace": namespace, + "data": attrs} def endTag(self, namespace, name): - assert namespace is None or isinstance(namespace, string_types), type(namespace) - assert isinstance(name, string_types), type(namespace) - return {"type": "EndTag", - "name": to_text(name, False), - "namespace": to_text(namespace), - "data": {}} + "name": name, + "namespace": namespace} def text(self, data): - assert isinstance(data, string_types), type(data) - - data = to_text(data) + data = data middle = data.lstrip(spaceCharacters) left = data[:len(data) - len(middle)] if left: @@ -101,25 +60,16 @@ def text(self, data): yield {"type": "SpaceCharacters", "data": right} def comment(self, data): - assert isinstance(data, string_types), type(data) - - return {"type": "Comment", "data": text_type(data)} - - def doctype(self, name, publicId=None, systemId=None, correct=True): - assert is_text_or_none(name), type(name) - assert is_text_or_none(publicId), type(publicId) - assert is_text_or_none(systemId), type(systemId) + return {"type": "Comment", "data": data} + def doctype(self, name, publicId=None, systemId=None): return {"type": "Doctype", - "name": to_text(name), - "publicId": to_text(publicId), - "systemId": to_text(systemId), - "correct": to_text(correct)} + "name": name, + "publicId": publicId, + "systemId": systemId} def entity(self, name): - assert isinstance(name, string_types), type(name) - - return {"type": "Entity", "name": text_type(name)} + return {"type": "Entity", "name": name} def unknown(self, nodeType): return self.error("Unknown node type: " + nodeType) @@ -154,7 +104,7 @@ def __iter__(self): elif type == ELEMENT: namespace, name, attributes, hasChildren = details - if name in voidElements: + if (not namespace or namespace == namespaces["html"]) and name in voidElements: for token in self.emptyTag(namespace, name, attributes, hasChildren): yield token @@ -187,7 +137,7 @@ def __iter__(self): type, details = details[0], details[1:] if type == ELEMENT: namespace, name, attributes, hasChildren = details - if name not in voidElements: + if (namespace and namespace != namespaces["html"]) or name not in voidElements: yield self.endTag(namespace, name) if self.tree is currentNode: currentNode = None diff --git a/html5lib/treewalkers/genshistream.py b/html5lib/treewalkers/genshistream.py index 24d33282..83cd1654 100644 --- a/html5lib/treewalkers/genshistream.py +++ b/html5lib/treewalkers/genshistream.py @@ -48,7 +48,7 @@ def tokens(self, event, next): elif kind == END: name = data.localname namespace = data.namespace - if name not in voidElements: + if namespace != namespaces["html"] or name not in voidElements: yield self.endTag(namespace, name) elif kind == COMMENT: diff --git a/html5lib/treewalkers/lxmletree.py b/html5lib/treewalkers/lxmletree.py index 90e116d3..173fa082 100644 --- a/html5lib/treewalkers/lxmletree.py +++ b/html5lib/treewalkers/lxmletree.py @@ -118,8 +118,10 @@ def __len__(self): class TreeWalker(_base.NonRecursiveTreeWalker): def __init__(self, tree): if hasattr(tree, "getroot"): + self.fragmentChildren = set() tree = Root(tree) elif isinstance(tree, list): + self.fragmentChildren = set(tree) tree = FragmentRoot(tree) _base.NonRecursiveTreeWalker.__init__(self, tree) self.filter = ihatexml.InfosetFilter() @@ -137,7 +139,7 @@ def getNodeDetails(self, node): return _base.DOCTYPE, node.name, node.public_id, node.system_id elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"): - return _base.TEXT, node.obj + return _base.TEXT, ensure_str(node.obj) elif node.tag == etree.Comment: return _base.COMMENT, ensure_str(node.text) @@ -197,5 +199,7 @@ def getParentNode(self, node): if key == "text": return node # else: fallback to "normal" processing + elif node in self.fragmentChildren: + return None return node.getparent()