diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py
index 8884696d..3ec63d72 100644
--- a/html5lib/filters/lint.py
+++ b/html5lib/filters/lint.py
@@ -1,90 +1,77 @@
from __future__ import absolute_import, division, unicode_literals
+from six import text_type
+
from . import _base
-from ..constants import cdataElements, rcdataElements, voidElements
+from ..constants import namespaces, voidElements
from ..constants import spaceCharacters
spaceCharacters = "".join(spaceCharacters)
-class LintError(Exception):
- pass
-
-
class Filter(_base.Filter):
def __iter__(self):
open_elements = []
- contentModelFlag = "PCDATA"
for token in _base.Filter.__iter__(self):
type = token["type"]
if type in ("StartTag", "EmptyTag"):
+ namespace = token["namespace"]
name = token["name"]
- if contentModelFlag != "PCDATA":
- raise LintError("StartTag not in PCDATA content model flag: %(tag)s" % {"tag": name})
- if not isinstance(name, str):
- raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
- if not name:
- raise LintError("Empty tag name")
- if type == "StartTag" and name in voidElements:
- raise LintError("Void element reported as StartTag token: %(tag)s" % {"tag": name})
- elif type == "EmptyTag" and name not in voidElements:
- raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]})
+ assert namespace is None or isinstance(namespace, text_type)
+ assert namespace != ""
+ assert isinstance(name, text_type)
+ assert name != ""
+ assert isinstance(token["data"], dict)
+ if (not namespace or namespace == namespaces["html"]) and name in voidElements:
+ assert type == "EmptyTag"
+ else:
+ assert type == "StartTag"
if type == "StartTag":
- open_elements.append(name)
- for name, value in token["data"]:
- if not isinstance(name, str):
- raise LintError("Attribute name is not a string: %(name)r" % {"name": name})
- if not name:
- raise LintError("Empty attribute name")
- if not isinstance(value, str):
- raise LintError("Attribute value is not a string: %(value)r" % {"value": value})
- if name in cdataElements:
- contentModelFlag = "CDATA"
- elif name in rcdataElements:
- contentModelFlag = "RCDATA"
- elif name == "plaintext":
- contentModelFlag = "PLAINTEXT"
+ open_elements.append((namespace, name))
+ for (namespace, name), value in token["data"].items():
+ assert namespace is None or isinstance(namespace, text_type)
+ assert namespace != ""
+ assert isinstance(name, text_type)
+ assert name != ""
+ assert isinstance(value, text_type)
elif type == "EndTag":
+ namespace = token["namespace"]
name = token["name"]
- if not isinstance(name, str):
- raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
- if not name:
- raise LintError("Empty tag name")
- if name in voidElements:
- raise LintError("Void element reported as EndTag token: %(tag)s" % {"tag": name})
- start_name = open_elements.pop()
- if start_name != name:
- raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name})
- contentModelFlag = "PCDATA"
+ assert namespace is None or isinstance(namespace, text_type)
+ assert namespace != ""
+ assert isinstance(name, text_type)
+ assert name != ""
+ if (not namespace or namespace == namespaces["html"]) and name in voidElements:
+ assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
+ else:
+ start = open_elements.pop()
+ assert start == (namespace, name)
elif type == "Comment":
- if contentModelFlag != "PCDATA":
- raise LintError("Comment not in PCDATA content model flag")
+ data = token["data"]
+ assert isinstance(data, text_type)
elif type in ("Characters", "SpaceCharacters"):
data = token["data"]
- if not isinstance(data, str):
- raise LintError("Attribute name is not a string: %(name)r" % {"name": data})
- if not data:
- raise LintError("%(type)s token with empty data" % {"type": type})
+ assert isinstance(data, text_type)
+ assert data != ""
if type == "SpaceCharacters":
- data = data.strip(spaceCharacters)
- if data:
- raise LintError("Non-space character(s) found in SpaceCharacters token: %(token)r" % {"token": data})
+ assert data.strip(spaceCharacters) == ""
elif type == "Doctype":
name = token["name"]
- if contentModelFlag != "PCDATA":
- raise LintError("Doctype not in PCDATA content model flag: %(name)s" % {"name": name})
- if not isinstance(name, str):
- raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
- # XXX: what to do with token["data"] ?
+ assert name is None or isinstance(name, text_type)
+ assert token["publicId"] is None or isinstance(token["publicId"], text_type)
+ assert token["systemId"] is None or isinstance(token["systemId"], text_type)
+
+ elif type == "Entity":
+ assert isinstance(token["name"], text_type)
- elif type in ("ParseError", "SerializeError"):
- pass
+ elif type in ("SerializeError", "SerializerError"):
+ assert isinstance(token["data"], text_type)
else:
- raise LintError("Unknown token type: %(type)s" % {"type": type})
+ assert False, "Unknown token type: %(type)s" % {"type": type}
yield token
diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py
index c79d0b1b..e59f25ea 100644
--- a/html5lib/tests/test_treewalkers.py
+++ b/html5lib/tests/test_treewalkers.py
@@ -14,6 +14,7 @@
from .support import get_data_files, TestData, convertExpected
from html5lib import html5parser, treewalkers, treebuilders, treeadapters, constants
+from html5lib.filters.lint import Filter as Lint
treeTypes = {
@@ -77,21 +78,21 @@ def test_all_tokens(self):
expected = [
{'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'},
{'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'},
- {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'},
+ {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'},
{'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
{'data': 'a', 'type': 'Characters'},
{'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'},
{'data': 'b', 'type': 'Characters'},
- {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'},
+ {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'},
{'data': 'c', 'type': 'Characters'},
- {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
- {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'}
+ {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
+ {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'}
]
for treeName, treeCls in sorted(treeTypes.items()):
p = html5parser.HTMLParser(tree=treeCls["builder"])
document = p.parse("<html><head></head><body>a<div>b</div>c</body></html>")
document = treeCls.get("adapter", lambda x: x)(document)
- output = treeCls["walker"](document)
+ output = Lint(treeCls["walker"](document))
for expectedToken, outputToken in zip(expected, output):
self.assertEqual(expectedToken, outputToken)
@@ -111,7 +112,7 @@ def runTreewalkerTest(innerHTML, input, expected, errors, treeClass):
document = treeClass.get("adapter", lambda x: x)(document)
try:
- output = treewalkers.pprint(treeClass["walker"](document))
+ output = treewalkers.pprint(Lint(treeClass["walker"](document)))
output = attrlist.sub(sortattrs, output)
expected = attrlist.sub(sortattrs, convertExpected(expected))
diff = "".join(unified_diff([line + "\n" for line in expected.splitlines()],
diff --git a/html5lib/treewalkers/_base.py b/html5lib/treewalkers/_base.py
index e79a4357..36e1ba24 100644
--- a/html5lib/treewalkers/_base.py
+++ b/html5lib/treewalkers/_base.py
@@ -1,8 +1,7 @@
from __future__ import absolute_import, division, unicode_literals
-from six import text_type, string_types
from xml.dom import Node
-from ..constants import voidElements, spaceCharacters
+from ..constants import namespaces, voidElements, spaceCharacters
__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
"TreeWalker", "NonRecursiveTreeWalker"]
@@ -18,24 +17,6 @@
spaceCharacters = "".join(spaceCharacters)
-def to_text(s, blank_if_none=True):
- """Wrapper around six.text_type to convert None to empty string"""
- if s is None:
- if blank_if_none:
- return ""
- else:
- return None
- elif isinstance(s, text_type):
- return s
- else:
- return text_type(s)
-
-
-def is_text_or_none(string):
- """Wrapper around isinstance(string_types) or is None"""
- return string is None or isinstance(string, string_types)
-
-
class TreeWalker(object):
def __init__(self, tree):
self.tree = tree
@@ -47,47 +28,25 @@ def error(self, msg):
return {"type": "SerializeError", "data": msg}
def emptyTag(self, namespace, name, attrs, hasChildren=False):
- assert namespace is None or isinstance(namespace, string_types), type(namespace)
- assert isinstance(name, string_types), type(name)
- assert all((namespace is None or isinstance(namespace, string_types)) and
- isinstance(name, string_types) and
- isinstance(value, string_types)
- for (namespace, name), value in attrs.items())
-
- yield {"type": "EmptyTag", "name": to_text(name, False),
- "namespace": to_text(namespace),
+ yield {"type": "EmptyTag", "name": name,
+ "namespace": namespace,
"data": attrs}
if hasChildren:
yield self.error("Void element has children")
def startTag(self, namespace, name, attrs):
- assert namespace is None or isinstance(namespace, string_types), type(namespace)
- assert isinstance(name, string_types), type(name)
- assert all((namespace is None or isinstance(namespace, string_types)) and
- isinstance(name, string_types) and
- isinstance(value, string_types)
- for (namespace, name), value in attrs.items())
-
return {"type": "StartTag",
- "name": text_type(name),
- "namespace": to_text(namespace),
- "data": dict(((to_text(namespace, False), to_text(name)),
- to_text(value, False))
- for (namespace, name), value in attrs.items())}
+ "name": name,
+ "namespace": namespace,
+ "data": attrs}
def endTag(self, namespace, name):
- assert namespace is None or isinstance(namespace, string_types), type(namespace)
- assert isinstance(name, string_types), type(namespace)
-
return {"type": "EndTag",
- "name": to_text(name, False),
- "namespace": to_text(namespace),
- "data": {}}
+ "name": name,
+ "namespace": namespace}
def text(self, data):
- assert isinstance(data, string_types), type(data)
-
- data = to_text(data)
+ data = data
middle = data.lstrip(spaceCharacters)
left = data[:len(data) - len(middle)]
if left:
@@ -101,25 +60,16 @@ def text(self, data):
yield {"type": "SpaceCharacters", "data": right}
def comment(self, data):
- assert isinstance(data, string_types), type(data)
-
- return {"type": "Comment", "data": text_type(data)}
-
- def doctype(self, name, publicId=None, systemId=None, correct=True):
- assert is_text_or_none(name), type(name)
- assert is_text_or_none(publicId), type(publicId)
- assert is_text_or_none(systemId), type(systemId)
+ return {"type": "Comment", "data": data}
+ def doctype(self, name, publicId=None, systemId=None):
return {"type": "Doctype",
- "name": to_text(name),
- "publicId": to_text(publicId),
- "systemId": to_text(systemId),
- "correct": to_text(correct)}
+ "name": name,
+ "publicId": publicId,
+ "systemId": systemId}
def entity(self, name):
- assert isinstance(name, string_types), type(name)
-
- return {"type": "Entity", "name": text_type(name)}
+ return {"type": "Entity", "name": name}
def unknown(self, nodeType):
return self.error("Unknown node type: " + nodeType)
@@ -154,7 +104,7 @@ def __iter__(self):
elif type == ELEMENT:
namespace, name, attributes, hasChildren = details
- if name in voidElements:
+ if (not namespace or namespace == namespaces["html"]) and name in voidElements:
for token in self.emptyTag(namespace, name, attributes,
hasChildren):
yield token
@@ -187,7 +137,7 @@ def __iter__(self):
type, details = details[0], details[1:]
if type == ELEMENT:
namespace, name, attributes, hasChildren = details
- if name not in voidElements:
+ if (namespace and namespace != namespaces["html"]) or name not in voidElements:
yield self.endTag(namespace, name)
if self.tree is currentNode:
currentNode = None
diff --git a/html5lib/treewalkers/genshistream.py b/html5lib/treewalkers/genshistream.py
index 24d33282..83cd1654 100644
--- a/html5lib/treewalkers/genshistream.py
+++ b/html5lib/treewalkers/genshistream.py
@@ -48,7 +48,7 @@ def tokens(self, event, next):
elif kind == END:
name = data.localname
namespace = data.namespace
- if name not in voidElements:
+ if namespace != namespaces["html"] or name not in voidElements:
yield self.endTag(namespace, name)
elif kind == COMMENT:
diff --git a/html5lib/treewalkers/lxmletree.py b/html5lib/treewalkers/lxmletree.py
index 90e116d3..173fa082 100644
--- a/html5lib/treewalkers/lxmletree.py
+++ b/html5lib/treewalkers/lxmletree.py
@@ -118,8 +118,10 @@ def __len__(self):
class TreeWalker(_base.NonRecursiveTreeWalker):
def __init__(self, tree):
+ self.fragmentChildren = set()
if hasattr(tree, "getroot"):
tree = Root(tree)
elif isinstance(tree, list):
+ self.fragmentChildren = set(tree)
tree = FragmentRoot(tree)
_base.NonRecursiveTreeWalker.__init__(self, tree)
self.filter = ihatexml.InfosetFilter()
@@ -137,7 +139,7 @@ def getNodeDetails(self, node):
return _base.DOCTYPE, node.name, node.public_id, node.system_id
elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
- return _base.TEXT, node.obj
+ return _base.TEXT, ensure_str(node.obj)
elif node.tag == etree.Comment:
return _base.COMMENT, ensure_str(node.text)
@@ -197,5 +199,7 @@ def getParentNode(self, node):
if key == "text":
return node
# else: fallback to "normal" processing
+ elif node in self.fragmentChildren:
+ return None
return node.getparent()