From 40c3ba67f3a8573f64c3fec9a2aec9d7ab8b8f8d Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Wed, 16 Dec 2015 01:30:55 +0000
Subject: [PATCH 01/16] Fix lint to expect text_type everywhere

---
 html5lib/filters/lint.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py
index 8884696d..9eee9cc5 100644
--- a/html5lib/filters/lint.py
+++ b/html5lib/filters/lint.py
@@ -1,5 +1,7 @@
 from __future__ import absolute_import, division, unicode_literals
 
+from six import text_type
+
 from . import _base
 from ..constants import cdataElements, rcdataElements, voidElements
 
@@ -21,7 +23,7 @@ def __iter__(self):
                 name = token["name"]
                 if contentModelFlag != "PCDATA":
                     raise LintError("StartTag not in PCDATA content model flag: %(tag)s" % {"tag": name})
-                if not isinstance(name, str):
+                if not isinstance(name, text_type):
                     raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
                 if not name:
                     raise LintError("Empty tag name")
@@ -32,11 +34,11 @@ def __iter__(self):
                 if type == "StartTag":
                     open_elements.append(name)
                 for name, value in token["data"]:
-                    if not isinstance(name, str):
+                    if not isinstance(name, text_type):
                         raise LintError("Attribute name is not a string: %(name)r" % {"name": name})
                     if not name:
                         raise LintError("Empty attribute name")
-                    if not isinstance(value, str):
+                    if not isinstance(value, text_type):
                         raise LintError("Attribute value is not a string: %(value)r" % {"value": value})
                 if name in cdataElements:
                     contentModelFlag = "CDATA"
@@ -47,7 +49,7 @@ def __iter__(self):
 
             elif type == "EndTag":
                 name = token["name"]
-                if not isinstance(name, str):
+                if not isinstance(name, text_type):
                     raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
                 if not name:
                     raise LintError("Empty tag name")
@@ -64,7 +66,7 @@ def __iter__(self):
 
             elif type in ("Characters", "SpaceCharacters"):
                 data = token["data"]
-                if not isinstance(data, str):
+                if not isinstance(data, text_type):
                     raise LintError("Attribute name is not a string: %(name)r" % {"name": data})
                 if not data:
                     raise LintError("%(type)s token with empty data" % {"type": type})
@@ -77,7 +79,7 @@ def __iter__(self):
                 name = token["name"]
                 if contentModelFlag != "PCDATA":
                     raise LintError("Doctype not in PCDATA content model flag: %(name)s" % {"name": name})
-                if not isinstance(name, str):
+                if not isinstance(name, text_type):
                     raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
                 # XXX: what to do with token["data"] ?
 

From fbbea1f614aaf69943c82271a37ec78623c362f7 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Wed, 16 Dec 2015 01:34:28 +0000
Subject: [PATCH 02/16] Update lint filter for namespaced attributes

---
 html5lib/filters/lint.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py
index 9eee9cc5..74cdc859 100644
--- a/html5lib/filters/lint.py
+++ b/html5lib/filters/lint.py
@@ -33,11 +33,15 @@ def __iter__(self):
                     raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]})
                 if type == "StartTag":
                     open_elements.append(name)
-                for name, value in token["data"]:
-                    if not isinstance(name, text_type):
-                        raise LintError("Attribute name is not a string: %(name)r" % {"name": name})
-                    if not name:
-                        raise LintError("Empty attribute name")
+                for (namespace, localname), value in token["data"].items():
+                    if namespace is not None and not isinstance(namespace, text_type):
+                        raise LintError("Attribute namespace is not a string or None: %(name)r" % {"name": namespace})
+                    if namespace == "":
+                        raise LintError("Empty attribute namespace")
+                    if not isinstance(localname, text_type):
+                        raise LintError("Attribute localname is not a string: %(name)r" % {"name": localname})
+                    if not localname:
+                        raise LintError("Empty attribute localname")
                     if not isinstance(value, text_type):
                         raise LintError("Attribute value is not a string: %(value)r" % {"value": value})
                 if name in cdataElements:

From 8b4d7c45b3715a3ae22ef543ec5cdfe5c742792e Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Wed, 16 Dec 2015 01:36:22 +0000
Subject: [PATCH 03/16] Drop the content model requirements from lint

---
 html5lib/filters/lint.py | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py
index 74cdc859..fc7c1ebe 100644
--- a/html5lib/filters/lint.py
+++ b/html5lib/filters/lint.py
@@ -3,7 +3,7 @@
 from six import text_type
 
 from . import _base
-from ..constants import cdataElements, rcdataElements, voidElements
+from ..constants import voidElements
 
 from ..constants import spaceCharacters
 spaceCharacters = "".join(spaceCharacters)
@@ -16,13 +16,10 @@ class LintError(Exception):
 class Filter(_base.Filter):
     def __iter__(self):
         open_elements = []
-        contentModelFlag = "PCDATA"
         for token in _base.Filter.__iter__(self):
             type = token["type"]
             if type in ("StartTag", "EmptyTag"):
                 name = token["name"]
-                if contentModelFlag != "PCDATA":
-                    raise LintError("StartTag not in PCDATA content model flag: %(tag)s" % {"tag": name})
                 if not isinstance(name, text_type):
                     raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
                 if not name:
@@ -44,12 +41,6 @@ def __iter__(self):
                         raise LintError("Empty attribute localname")
                     if not isinstance(value, text_type):
                         raise LintError("Attribute value is not a string: %(value)r" % {"value": value})
-                if name in cdataElements:
-                    contentModelFlag = "CDATA"
-                elif name in rcdataElements:
-                    contentModelFlag = "RCDATA"
-                elif name == "plaintext":
-                    contentModelFlag = "PLAINTEXT"
 
             elif type == "EndTag":
                 name = token["name"]
@@ -62,11 +53,9 @@ def __iter__(self):
                 start_name = open_elements.pop()
                 if start_name != name:
                     raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name})
-                contentModelFlag = "PCDATA"
 
             elif type == "Comment":
-                if contentModelFlag != "PCDATA":
-                    raise LintError("Comment not in PCDATA content model flag")
+                pass
 
             elif type in ("Characters", "SpaceCharacters"):
                 data = token["data"]
@@ -81,8 +70,6 @@ def __iter__(self):
 
             elif type == "Doctype":
                 name = token["name"]
-                if contentModelFlag != "PCDATA":
-                    raise LintError("Doctype not in PCDATA content model flag: %(name)s" % {"name": name})
                 if not isinstance(name, text_type):
                     raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
                 # XXX: what to do with token["data"] ?

From 270a2ca14fafc989f8f1bd4f79db2f4bd9f4d1fc Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Wed, 16 Dec 2015 02:05:55 +0000
Subject: [PATCH 04/16] Don't let the lxml treewalker walk above the fragment
 root

---
 html5lib/treewalkers/lxmletree.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/html5lib/treewalkers/lxmletree.py b/html5lib/treewalkers/lxmletree.py
index 90e116d3..5c258a86 100644
--- a/html5lib/treewalkers/lxmletree.py
+++ b/html5lib/treewalkers/lxmletree.py
@@ -118,8 +118,10 @@ def __len__(self):
 class TreeWalker(_base.NonRecursiveTreeWalker):
     def __init__(self, tree):
         if hasattr(tree, "getroot"):
+            self.fragmentChildren = set()
             tree = Root(tree)
         elif isinstance(tree, list):
+            self.fragmentChildren = set(tree)
             tree = FragmentRoot(tree)
         _base.NonRecursiveTreeWalker.__init__(self, tree)
         self.filter = ihatexml.InfosetFilter()
@@ -197,5 +199,7 @@ def getParentNode(self, node):
             if key == "text":
                 return node
             # else: fallback to "normal" processing
+        elif node in self.fragmentChildren:
+            return None
 
         return node.getparent()

From 66ef02658ba79d5cffc65d71468da3b3d0b6398e Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Wed, 16 Dec 2015 02:22:22 +0000
Subject: [PATCH 05/16] Teach lint & treewalkers that elements are only void in
 HTML ns

---
 html5lib/filters/lint.py             | 22 ++++++++++++++++------
 html5lib/treewalkers/_base.py        |  6 +++---
 html5lib/treewalkers/genshistream.py |  2 +-
 3 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py
index fc7c1ebe..cc3e4ac4 100644
--- a/html5lib/filters/lint.py
+++ b/html5lib/filters/lint.py
@@ -3,7 +3,7 @@
 from six import text_type
 
 from . import _base
-from ..constants import voidElements
+from ..constants import namespaces, voidElements
 
 from ..constants import spaceCharacters
 spaceCharacters = "".join(spaceCharacters)
@@ -19,17 +19,22 @@ def __iter__(self):
         for token in _base.Filter.__iter__(self):
             type = token["type"]
             if type in ("StartTag", "EmptyTag"):
+                namespace = token["namespace"]
                 name = token["name"]
+                if namespace is not None and not isinstance(namespace, text_type):
+                    raise LintError("Tag namespace is not a string or None: %(name)r" % {"name": namespace})
+                if namespace == "":
+                    raise LintError("Empty tag namespace")
                 if not isinstance(name, text_type):
                     raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
                 if not name:
                     raise LintError("Empty tag name")
-                if type == "StartTag" and name in voidElements:
+                if type == "StartTag" and (not namespace or namespace == namespaces["html"]) and name in voidElements:
                     raise LintError("Void element reported as StartTag token: %(tag)s" % {"tag": name})
-                elif type == "EmptyTag" and name not in voidElements:
+                elif type == "EmptyTag" and (not namespace or namespace == namespaces["html"]) and name not in voidElements:
                     raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]})
                 if type == "StartTag":
-                    open_elements.append(name)
+                    open_elements.append((namespace, name))
                 for (namespace, localname), value in token["data"].items():
                     if namespace is not None and not isinstance(namespace, text_type):
                         raise LintError("Attribute namespace is not a string or None: %(name)r" % {"name": namespace})
@@ -43,15 +48,20 @@ def __iter__(self):
                         raise LintError("Attribute value is not a string: %(value)r" % {"value": value})
 
             elif type == "EndTag":
+                namespace = token["namespace"]
                 name = token["name"]
+                if namespace is not None and not isinstance(namespace, text_type):
+                    raise LintError("Tag namespace is not a string or None: %(name)r" % {"name": namespace})
+                if namespace == "":
+                    raise LintError("Empty tag namespace")
                 if not isinstance(name, text_type):
                     raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
                 if not name:
                     raise LintError("Empty tag name")
-                if name in voidElements:
+                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
                     raise LintError("Void element reported as EndTag token: %(tag)s" % {"tag": name})
                 start_name = open_elements.pop()
-                if start_name != name:
+                if start_name != (namespace, name):
                     raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name})
 
             elif type == "Comment":
diff --git a/html5lib/treewalkers/_base.py b/html5lib/treewalkers/_base.py
index e79a4357..271f45a0 100644
--- a/html5lib/treewalkers/_base.py
+++ b/html5lib/treewalkers/_base.py
@@ -2,7 +2,7 @@
 from six import text_type, string_types
 
 from xml.dom import Node
-from ..constants import voidElements, spaceCharacters
+from ..constants import namespaces, voidElements, spaceCharacters
 
 __all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
            "TreeWalker", "NonRecursiveTreeWalker"]
@@ -154,7 +154,7 @@ def __iter__(self):
 
             elif type == ELEMENT:
                 namespace, name, attributes, hasChildren = details
-                if name in voidElements:
+                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
                     for token in self.emptyTag(namespace, name, attributes,
                                                hasChildren):
                         yield token
@@ -187,7 +187,7 @@ def __iter__(self):
                     type, details = details[0], details[1:]
                     if type == ELEMENT:
                         namespace, name, attributes, hasChildren = details
-                        if name not in voidElements:
+                        if (namespace and namespace != namespaces["html"]) or name not in voidElements:
                             yield self.endTag(namespace, name)
                     if self.tree is currentNode:
                         currentNode = None
diff --git a/html5lib/treewalkers/genshistream.py b/html5lib/treewalkers/genshistream.py
index 24d33282..83cd1654 100644
--- a/html5lib/treewalkers/genshistream.py
+++ b/html5lib/treewalkers/genshistream.py
@@ -48,7 +48,7 @@ def tokens(self, event, next):
         elif kind == END:
             name = data.localname
             namespace = data.namespace
-            if name not in voidElements:
+            if namespace != namespaces["html"] or name not in voidElements:
                 yield self.endTag(namespace, name)
 
         elif kind == COMMENT:

From 5bd341350b22a78295c9b2883b568774d15fadef Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Wed, 16 Dec 2015 02:26:44 +0000
Subject: [PATCH 06/16] Use lint filter to ensure validity of treewalkers

---
 html5lib/tests/test_treewalkers.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py
index c79d0b1b..04a6cae4 100644
--- a/html5lib/tests/test_treewalkers.py
+++ b/html5lib/tests/test_treewalkers.py
@@ -14,6 +14,7 @@
 from .support import get_data_files, TestData, convertExpected
 
 from html5lib import html5parser, treewalkers, treebuilders, treeadapters, constants
+from html5lib.filters.lint import Filter as Lint
 
 
 treeTypes = {
@@ -91,7 +92,7 @@ def test_all_tokens(self):
             p = html5parser.HTMLParser(tree=treeCls["builder"])
             document = p.parse("<html><head></head><body>a<div>b</div>c</body></html>")
             document = treeCls.get("adapter", lambda x: x)(document)
-            output = treeCls["walker"](document)
+            output = Lint(treeCls["walker"](document))
             for expectedToken, outputToken in zip(expected, output):
                 self.assertEqual(expectedToken, outputToken)
 
@@ -111,7 +112,7 @@ def runTreewalkerTest(innerHTML, input, expected, errors, treeClass):
 
     document = treeClass.get("adapter", lambda x: x)(document)
     try:
-        output = treewalkers.pprint(treeClass["walker"](document))
+        output = treewalkers.pprint(Lint(treeClass["walker"](document)))
         output = attrlist.sub(sortattrs, output)
         expected = attrlist.sub(sortattrs, convertExpected(expected))
         diff = "".join(unified_diff([line + "\n" for line in expected.splitlines()],

From fb9e1776a565ca157c33e4301891a58dee4337c4 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Wed, 16 Dec 2015 02:27:56 +0000
Subject: [PATCH 07/16] Remove runtime type checks from treewalkers._base

---
 html5lib/treewalkers/_base.py | 32 --------------------------------
 1 file changed, 32 deletions(-)

diff --git a/html5lib/treewalkers/_base.py b/html5lib/treewalkers/_base.py
index 271f45a0..dd6823dd 100644
--- a/html5lib/treewalkers/_base.py
+++ b/html5lib/treewalkers/_base.py
@@ -31,11 +31,6 @@ def to_text(s, blank_if_none=True):
         return text_type(s)
 
 
-def is_text_or_none(string):
-    """Wrapper around isinstance(string_types) or is None"""
-    return string is None or isinstance(string, string_types)
-
-
 class TreeWalker(object):
     def __init__(self, tree):
         self.tree = tree
@@ -47,13 +42,6 @@ def error(self, msg):
         return {"type": "SerializeError", "data": msg}
 
     def emptyTag(self, namespace, name, attrs, hasChildren=False):
-        assert namespace is None or isinstance(namespace, string_types), type(namespace)
-        assert isinstance(name, string_types), type(name)
-        assert all((namespace is None or isinstance(namespace, string_types)) and
-                   isinstance(name, string_types) and
-                   isinstance(value, string_types)
-                   for (namespace, name), value in attrs.items())
-
         yield {"type": "EmptyTag", "name": to_text(name, False),
                "namespace": to_text(namespace),
                "data": attrs}
@@ -61,13 +49,6 @@ def emptyTag(self, namespace, name, attrs, hasChildren=False):
             yield self.error("Void element has children")
 
     def startTag(self, namespace, name, attrs):
-        assert namespace is None or isinstance(namespace, string_types), type(namespace)
-        assert isinstance(name, string_types), type(name)
-        assert all((namespace is None or isinstance(namespace, string_types)) and
-                   isinstance(name, string_types) and
-                   isinstance(value, string_types)
-                   for (namespace, name), value in attrs.items())
-
         return {"type": "StartTag",
                 "name": text_type(name),
                 "namespace": to_text(namespace),
@@ -76,17 +57,12 @@ def startTag(self, namespace, name, attrs):
                              for (namespace, name), value in attrs.items())}
 
     def endTag(self, namespace, name):
-        assert namespace is None or isinstance(namespace, string_types), type(namespace)
-        assert isinstance(name, string_types), type(namespace)
-
         return {"type": "EndTag",
                 "name": to_text(name, False),
                 "namespace": to_text(namespace),
                 "data": {}}
 
     def text(self, data):
-        assert isinstance(data, string_types), type(data)
-
         data = to_text(data)
         middle = data.lstrip(spaceCharacters)
         left = data[:len(data) - len(middle)]
@@ -101,15 +77,9 @@ def text(self, data):
             yield {"type": "SpaceCharacters", "data": right}
 
     def comment(self, data):
-        assert isinstance(data, string_types), type(data)
-
         return {"type": "Comment", "data": text_type(data)}
 
     def doctype(self, name, publicId=None, systemId=None, correct=True):
-        assert is_text_or_none(name), type(name)
-        assert is_text_or_none(publicId), type(publicId)
-        assert is_text_or_none(systemId), type(systemId)
-
         return {"type": "Doctype",
                 "name": to_text(name),
                 "publicId": to_text(publicId),
@@ -117,8 +87,6 @@ def doctype(self, name, publicId=None, systemId=None, correct=True):
                 "correct": to_text(correct)}
 
     def entity(self, name):
-        assert isinstance(name, string_types), type(name)
-
         return {"type": "Entity", "name": text_type(name)}
 
     def unknown(self, nodeType):

From 2a5d7af11230225200cdaf101bb36980a8fd3f8e Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Wed, 16 Dec 2015 02:34:39 +0000
Subject: [PATCH 08/16] Make sure we have the unicode form of text in lxml
 fragment root

---
 html5lib/treewalkers/lxmletree.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/html5lib/treewalkers/lxmletree.py b/html5lib/treewalkers/lxmletree.py
index 5c258a86..173fa082 100644
--- a/html5lib/treewalkers/lxmletree.py
+++ b/html5lib/treewalkers/lxmletree.py
@@ -139,7 +139,7 @@ def getNodeDetails(self, node):
             return _base.DOCTYPE, node.name, node.public_id, node.system_id
 
         elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
-            return _base.TEXT, node.obj
+            return _base.TEXT, ensure_str(node.obj)
 
         elif node.tag == etree.Comment:
             return _base.COMMENT, ensure_str(node.text)

From 9eff304ce8a230ecfe84a4c4fcb61b887bfcc551 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Wed, 16 Dec 2015 02:35:13 +0000
Subject: [PATCH 09/16] Allow None as a doctype tagname in lint

---
 html5lib/filters/lint.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py
index cc3e4ac4..9f99a876 100644
--- a/html5lib/filters/lint.py
+++ b/html5lib/filters/lint.py
@@ -80,8 +80,8 @@ def __iter__(self):
 
             elif type == "Doctype":
                 name = token["name"]
-                if not isinstance(name, text_type):
-                    raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
+                if name is not None and not isinstance(name, text_type):
+                    raise LintError("Tag name is not a string or None: %(tag)r" % {"tag": name})
                 # XXX: what to do with token["data"] ?
 
             elif type in ("ParseError", "SerializeError"):

From e0ea89948b80a300825b039fcfcda8ec4a13d513 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Wed, 16 Dec 2015 02:39:38 +0000
Subject: [PATCH 10/16] Drop all the to_text magic in treewalkers._base

---
 html5lib/treewalkers/_base.py | 44 +++++++++++------------------------
 1 file changed, 14 insertions(+), 30 deletions(-)

diff --git a/html5lib/treewalkers/_base.py b/html5lib/treewalkers/_base.py
index dd6823dd..6d0faef1 100644
--- a/html5lib/treewalkers/_base.py
+++ b/html5lib/treewalkers/_base.py
@@ -1,5 +1,4 @@
 from __future__ import absolute_import, division, unicode_literals
-from six import text_type, string_types
 
 from xml.dom import Node
 from ..constants import namespaces, voidElements, spaceCharacters
@@ -18,19 +17,6 @@
 spaceCharacters = "".join(spaceCharacters)
 
 
-def to_text(s, blank_if_none=True):
-    """Wrapper around six.text_type to convert None to empty string"""
-    if s is None:
-        if blank_if_none:
-            return ""
-        else:
-            return None
-    elif isinstance(s, text_type):
-        return s
-    else:
-        return text_type(s)
-
-
 class TreeWalker(object):
     def __init__(self, tree):
         self.tree = tree
@@ -42,28 +28,26 @@ def error(self, msg):
         return {"type": "SerializeError", "data": msg}
 
     def emptyTag(self, namespace, name, attrs, hasChildren=False):
-        yield {"type": "EmptyTag", "name": to_text(name, False),
-               "namespace": to_text(namespace),
+        yield {"type": "EmptyTag", "name": name,
+               "namespace": namespace,
                "data": attrs}
         if hasChildren:
             yield self.error("Void element has children")
 
     def startTag(self, namespace, name, attrs):
         return {"type": "StartTag",
-                "name": text_type(name),
-                "namespace": to_text(namespace),
-                "data": dict(((to_text(namespace, False), to_text(name)),
-                              to_text(value, False))
-                             for (namespace, name), value in attrs.items())}
+                "name": name,
+                "namespace": namespace,
+                "data": attrs}
 
     def endTag(self, namespace, name):
         return {"type": "EndTag",
-                "name": to_text(name, False),
-                "namespace": to_text(namespace),
+                "name": name,
+                "namespace": namespace,
                 "data": {}}
 
     def text(self, data):
-        data = to_text(data)
+        data = data
         middle = data.lstrip(spaceCharacters)
         left = data[:len(data) - len(middle)]
         if left:
@@ -77,17 +61,17 @@ def text(self, data):
             yield {"type": "SpaceCharacters", "data": right}
 
     def comment(self, data):
-        return {"type": "Comment", "data": text_type(data)}
+        return {"type": "Comment", "data": data}
 
     def doctype(self, name, publicId=None, systemId=None, correct=True):
         return {"type": "Doctype",
-                "name": to_text(name),
-                "publicId": to_text(publicId),
-                "systemId": to_text(systemId),
-                "correct": to_text(correct)}
+                "name": name,
+                "publicId": publicId,
+                "systemId": systemId,
+                "correct": correct}
 
     def entity(self, name):
-        return {"type": "Entity", "name": text_type(name)}
+        return {"type": "Entity", "name": name}
 
     def unknown(self, nodeType):
         return self.error("Unknown node type: " + nodeType)

From 22c2b1ac0fc9eb73aefde898f7b9c948e34dc041 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Wed, 16 Dec 2015 02:52:28 +0000
Subject: [PATCH 11/16] Get rid of LintError and just use asserts

All of these properties should always hold per the API, so asserts
seem like a good match here.
---
 html5lib/filters/lint.py | 77 +++++++++++++++-------------------------
 1 file changed, 28 insertions(+), 49 deletions(-)

diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py
index 9f99a876..e2434ef4 100644
--- a/html5lib/filters/lint.py
+++ b/html5lib/filters/lint.py
@@ -9,10 +9,6 @@
 spaceCharacters = "".join(spaceCharacters)
 
 
-class LintError(Exception):
-    pass
-
-
 class Filter(_base.Filter):
     def __iter__(self):
         open_elements = []
@@ -21,73 +17,56 @@ def __iter__(self):
             if type in ("StartTag", "EmptyTag"):
                 namespace = token["namespace"]
                 name = token["name"]
-                if namespace is not None and not isinstance(namespace, text_type):
-                    raise LintError("Tag namespace is not a string or None: %(name)r" % {"name": namespace})
-                if namespace == "":
-                    raise LintError("Empty tag namespace")
-                if not isinstance(name, text_type):
-                    raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
-                if not name:
-                    raise LintError("Empty tag name")
-                if type == "StartTag" and (not namespace or namespace == namespaces["html"]) and name in voidElements:
-                    raise LintError("Void element reported as StartTag token: %(tag)s" % {"tag": name})
-                elif type == "EmptyTag" and (not namespace or namespace == namespaces["html"]) and name not in voidElements:
-                    raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]})
+                assert namespace is None or isinstance(namespace, text_type)
+                assert namespace != ""
+                assert isinstance(name, text_type)
+                assert name != ""
+                assert isinstance(token["data"], dict)
+                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
+                    assert type == "EmptyTag"
+                else:
+                    assert type == "StartTag"
                 if type == "StartTag":
                     open_elements.append((namespace, name))
-                for (namespace, localname), value in token["data"].items():
-                    if namespace is not None and not isinstance(namespace, text_type):
-                        raise LintError("Attribute namespace is not a string or None: %(name)r" % {"name": namespace})
-                    if namespace == "":
-                        raise LintError("Empty attribute namespace")
-                    if not isinstance(localname, text_type):
-                        raise LintError("Attribute localname is not a string: %(name)r" % {"name": localname})
-                    if not localname:
-                        raise LintError("Empty attribute localname")
-                    if not isinstance(value, text_type):
-                        raise LintError("Attribute value is not a string: %(value)r" % {"value": value})
+                for (namespace, name), value in token["data"].items():
+                    assert namespace is None or isinstance(namespace, text_type)
+                    assert namespace != ""
+                    assert isinstance(name, text_type)
+                    assert name != ""
+                    assert isinstance(value, text_type)
 
             elif type == "EndTag":
                 namespace = token["namespace"]
                 name = token["name"]
-                if namespace is not None and not isinstance(namespace, text_type):
-                    raise LintError("Tag namespace is not a string or None: %(name)r" % {"name": namespace})
-                if namespace == "":
-                    raise LintError("Empty tag namespace")
-                if not isinstance(name, text_type):
-                    raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
-                if not name:
-                    raise LintError("Empty tag name")
+                assert namespace is None or isinstance(namespace, text_type)
+                assert namespace != ""
+                assert isinstance(name, text_type)
+                assert name != ""
                 if (not namespace or namespace == namespaces["html"]) and name in voidElements:
-                    raise LintError("Void element reported as EndTag token: %(tag)s" % {"tag": name})
-                start_name = open_elements.pop()
-                if start_name != (namespace, name):
-                    raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name})
+                    assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
+                else:
+                    start = open_elements.pop()
+                    assert start == (namespace, name)
 
             elif type == "Comment":
                 pass
 
             elif type in ("Characters", "SpaceCharacters"):
                 data = token["data"]
-                if not isinstance(data, text_type):
-                    raise LintError("Attribute name is not a string: %(name)r" % {"name": data})
-                if not data:
-                    raise LintError("%(type)s token with empty data" % {"type": type})
+                assert isinstance(data, text_type)
+                assert data != ""
                 if type == "SpaceCharacters":
-                    data = data.strip(spaceCharacters)
-                    if data:
-                        raise LintError("Non-space character(s) found in SpaceCharacters token: %(token)r" % {"token": data})
+                    assert data.strip(spaceCharacters) == ""
 
             elif type == "Doctype":
                 name = token["name"]
-                if name is not None and not isinstance(name, text_type):
-                    raise LintError("Tag name is not a string or None: %(tag)r" % {"tag": name})
+                assert name is None or isinstance(name, text_type)
                 # XXX: what to do with token["data"] ?
 
             elif type in ("ParseError", "SerializeError"):
                 pass
 
             else:
-                raise LintError("Unknown token type: %(type)s" % {"type": type})
+                assert False, "Unknown token type: %(type)s" % {"type": type}
 
             yield token

From 5336ebea678f099f5def28ffe3924c41c6de782d Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Wed, 16 Dec 2015 02:54:53 +0000
Subject: [PATCH 12/16] Lint that comments are text_type

---
 html5lib/filters/lint.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py
index e2434ef4..be51b852 100644
--- a/html5lib/filters/lint.py
+++ b/html5lib/filters/lint.py
@@ -49,7 +49,8 @@ def __iter__(self):
                     assert start == (namespace, name)
 
             elif type == "Comment":
-                pass
+                data = token["data"]
+                assert isinstance(data, text_type)
 
             elif type in ("Characters", "SpaceCharacters"):
                 data = token["data"]

From dc879ffaab0455e8974ceaac40b727e5a04c1175 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Wed, 16 Dec 2015 02:55:06 +0000
Subject: [PATCH 13/16] Don't allow ParseError/SerializerError tokens, whatever
 they are!

---
 html5lib/filters/lint.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py
index be51b852..076dbc54 100644
--- a/html5lib/filters/lint.py
+++ b/html5lib/filters/lint.py
@@ -64,9 +64,6 @@ def __iter__(self):
                 assert name is None or isinstance(name, text_type)
                 # XXX: what to do with token["data"] ?
 
-            elif type in ("ParseError", "SerializeError"):
-                pass
-
             else:
                 assert False, "Unknown token type: %(type)s" % {"type": type}
 

From 7f8bd13cc2d6e334d898c64afecf4b1bf64c5f93 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Wed, 16 Dec 2015 02:55:32 +0000
Subject: [PATCH 14/16] Drop end tag tree walker's data (always empty now)

---
 html5lib/tests/test_treewalkers.py | 8 ++++----
 html5lib/treewalkers/_base.py      | 3 +--
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py
index 04a6cae4..e59f25ea 100644
--- a/html5lib/tests/test_treewalkers.py
+++ b/html5lib/tests/test_treewalkers.py
@@ -78,15 +78,15 @@ def test_all_tokens(self):
         expected = [
             {'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'},
             {'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'},
-            {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'},
+            {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'},
             {'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
             {'data': 'a', 'type': 'Characters'},
             {'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'},
             {'data': 'b', 'type': 'Characters'},
-            {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'},
+            {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'},
             {'data': 'c', 'type': 'Characters'},
-            {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
-            {'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'}
+            {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
+            {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'}
         ]
         for treeName, treeCls in sorted(treeTypes.items()):
             p = html5parser.HTMLParser(tree=treeCls["builder"])
diff --git a/html5lib/treewalkers/_base.py b/html5lib/treewalkers/_base.py
index 6d0faef1..bf66ec71 100644
--- a/html5lib/treewalkers/_base.py
+++ b/html5lib/treewalkers/_base.py
@@ -43,8 +43,7 @@ def startTag(self, namespace, name, attrs):
     def endTag(self, namespace, name):
         return {"type": "EndTag",
                 "name": name,
-                "namespace": namespace,
-                "data": {}}
+                "namespace": namespace}
 
     def text(self, data):
         data = data

From c335295f6b9d0b0710b86d94f79494cc676deb70 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Wed, 16 Dec 2015 02:57:59 +0000
Subject: [PATCH 15/16] Drop tree walker doctype correct flag, whatever that
 once was!

---
 html5lib/treewalkers/_base.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/html5lib/treewalkers/_base.py b/html5lib/treewalkers/_base.py
index bf66ec71..36e1ba24 100644
--- a/html5lib/treewalkers/_base.py
+++ b/html5lib/treewalkers/_base.py
@@ -62,12 +62,11 @@ def text(self, data):
     def comment(self, data):
         return {"type": "Comment", "data": data}
 
-    def doctype(self, name, publicId=None, systemId=None, correct=True):
+    def doctype(self, name, publicId=None, systemId=None):
         return {"type": "Doctype",
                 "name": name,
                 "publicId": publicId,
-                "systemId": systemId,
-                "correct": correct}
+                "systemId": systemId}
 
     def entity(self, name):
         return {"type": "Entity", "name": name}

From ca6591cca342065305949189f5adbc741f76fe9b Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Wed, 16 Dec 2015 03:55:12 +0000
Subject: [PATCH 16/16] Make sure lint is testing everything treewalkers can
 do.

---
 html5lib/filters/lint.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py
index 076dbc54..3ec63d72 100644
--- a/html5lib/filters/lint.py
+++ b/html5lib/filters/lint.py
@@ -62,7 +62,14 @@ def __iter__(self):
             elif type == "Doctype":
                 name = token["name"]
                 assert name is None or isinstance(name, text_type)
-                # XXX: what to do with token["data"] ?
+                assert token["publicId"] is None or isinstance(token["publicId"], text_type)
+                assert token["systemId"] is None or isinstance(token["systemId"], text_type)
+
+            elif type == "Entity":
+                assert isinstance(token["name"], text_type)
+
+            elif type == "SerializerError":
+                assert isinstance(token["data"], text_type)
 
             else:
                 assert False, "Unknown token type: %(type)s" % {"type": type}