Skip to content

Lint fixes #222

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Jan 12, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 44 additions & 57 deletions html5lib/filters/lint.py
Original file line number Diff line number Diff line change
@@ -1,90 +1,77 @@
from __future__ import absolute_import, division, unicode_literals

from six import text_type

from . import _base
from ..constants import cdataElements, rcdataElements, voidElements
from ..constants import namespaces, voidElements

from ..constants import spaceCharacters
spaceCharacters = "".join(spaceCharacters)


class LintError(Exception):
    """Exception type for problems found while linting a token stream."""


class Filter(_base.Filter):
    """Pass-through filter that sanity-checks a tree walker token stream.

    Every token obtained from the wrapped source is checked with ``assert``
    statements and then yielded unchanged, so the filter can be dropped
    between a tree walker and its consumer.
    """

    def __iter__(self):
        """Yield each source token unchanged after asserting it is well formed.

        Raises:
            AssertionError: if a token has a field of the wrong type, an
                empty name or namespace, a void element is reported with
                the wrong token type, an end tag does not match the most
                recently opened element, or the token type is unknown.
        """
        # Stack of (namespace, name) pairs for StartTag tokens not yet closed.
        open_elements = []
        html_namespace = namespaces["html"]

        for token in _base.Filter.__iter__(self):
            token_type = token["type"]

            if token_type in ("StartTag", "EmptyTag"):
                namespace = token["namespace"]
                name = token["name"]
                assert namespace is None or isinstance(namespace, text_type)
                assert namespace != ""
                assert isinstance(name, text_type)
                assert name != ""
                assert isinstance(token["data"], dict)
                # Only elements with no namespace or the HTML namespace are
                # treated as void; same-named foreign elements are not.
                if (not namespace or namespace == html_namespace) and name in voidElements:
                    assert token_type == "EmptyTag"
                else:
                    assert token_type == "StartTag"
                if token_type == "StartTag":
                    open_elements.append((namespace, name))
                for (attr_namespace, attr_name), value in token["data"].items():
                    assert attr_namespace is None or isinstance(attr_namespace, text_type)
                    assert attr_namespace != ""
                    assert isinstance(attr_name, text_type)
                    assert attr_name != ""
                    assert isinstance(value, text_type)

            elif token_type == "EndTag":
                namespace = token["namespace"]
                name = token["name"]
                assert namespace is None or isinstance(namespace, text_type)
                assert namespace != ""
                assert isinstance(name, text_type)
                assert name != ""
                if (not namespace or namespace == html_namespace) and name in voidElements:
                    assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
                else:
                    start = open_elements.pop()
                    assert start == (namespace, name)

            elif token_type == "Comment":
                data = token["data"]
                assert isinstance(data, text_type)

            elif token_type in ("Characters", "SpaceCharacters"):
                data = token["data"]
                assert isinstance(data, text_type)
                assert data != ""
                if token_type == "SpaceCharacters":
                    assert data.strip(spaceCharacters) == ""

            elif token_type == "Doctype":
                name = token["name"]
                public_id = token["publicId"]
                system_id = token["systemId"]
                assert name is None or isinstance(name, text_type)
                # Check each identifier itself; testing ``name`` here would
                # leave publicId/systemId effectively unvalidated.
                assert public_id is None or isinstance(public_id, text_type)
                assert system_id is None or isinstance(system_id, text_type)

            elif token_type == "Entity":
                assert isinstance(token["name"], text_type)

            elif token_type in ("SerializeError", "SerializerError"):
                # The tree walker's error() emits "SerializeError"; the
                # "SerializerError" spelling is still accepted so existing
                # producers of that spelling keep passing.
                assert isinstance(token["data"], text_type)

            else:
                assert False, "Unknown token type: %(type)s" % {"type": token_type}

            yield token
13 changes: 7 additions & 6 deletions html5lib/tests/test_treewalkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from .support import get_data_files, TestData, convertExpected

from html5lib import html5parser, treewalkers, treebuilders, treeadapters, constants
from html5lib.filters.lint import Filter as Lint


treeTypes = {
Expand Down Expand Up @@ -77,21 +78,21 @@ def test_all_tokens(self):
expected = [
{'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'},
{'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'},
{'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'},
{'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'},
{'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
{'data': 'a', 'type': 'Characters'},
{'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'},
{'data': 'b', 'type': 'Characters'},
{'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'},
{'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'},
{'data': 'c', 'type': 'Characters'},
{'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
{'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'}
{'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
{'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'}
]
for treeName, treeCls in sorted(treeTypes.items()):
p = html5parser.HTMLParser(tree=treeCls["builder"])
document = p.parse("<html><head></head><body>a<div>b</div>c</body></html>")
document = treeCls.get("adapter", lambda x: x)(document)
output = treeCls["walker"](document)
output = Lint(treeCls["walker"](document))
for expectedToken, outputToken in zip(expected, output):
self.assertEqual(expectedToken, outputToken)

Expand All @@ -111,7 +112,7 @@ def runTreewalkerTest(innerHTML, input, expected, errors, treeClass):

document = treeClass.get("adapter", lambda x: x)(document)
try:
output = treewalkers.pprint(treeClass["walker"](document))
output = treewalkers.pprint(Lint(treeClass["walker"](document)))
output = attrlist.sub(sortattrs, output)
expected = attrlist.sub(sortattrs, convertExpected(expected))
diff = "".join(unified_diff([line + "\n" for line in expected.splitlines()],
Expand Down
84 changes: 17 additions & 67 deletions html5lib/treewalkers/_base.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type, string_types

from xml.dom import Node
from ..constants import voidElements, spaceCharacters
from ..constants import namespaces, voidElements, spaceCharacters

__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
"TreeWalker", "NonRecursiveTreeWalker"]
Expand All @@ -18,24 +17,6 @@
spaceCharacters = "".join(spaceCharacters)


def to_text(s, blank_if_none=True):
    """Coerce ``s`` to ``text_type``, with special handling for ``None``.

    ``None`` becomes ``""`` when ``blank_if_none`` is true and is returned
    as ``None`` otherwise. A value that is already ``text_type`` is returned
    as is; anything else is converted with ``text_type(s)``.
    """
    if s is None:
        return "" if blank_if_none else None
    if isinstance(s, text_type):
        return s
    return text_type(s)


def is_text_or_none(string):
    """Report whether ``string`` is ``None`` or an instance of ``string_types``."""
    if string is None:
        return True
    return isinstance(string, string_types)


class TreeWalker(object):
def __init__(self, tree):
self.tree = tree
Expand All @@ -47,47 +28,25 @@ def error(self, msg):
return {"type": "SerializeError", "data": msg}

def emptyTag(self, namespace, name, attrs, hasChildren=False):
assert namespace is None or isinstance(namespace, string_types), type(namespace)
assert isinstance(name, string_types), type(name)
assert all((namespace is None or isinstance(namespace, string_types)) and
isinstance(name, string_types) and
isinstance(value, string_types)
for (namespace, name), value in attrs.items())

yield {"type": "EmptyTag", "name": to_text(name, False),
"namespace": to_text(namespace),
yield {"type": "EmptyTag", "name": name,
"namespace": namespace,
"data": attrs}
if hasChildren:
yield self.error("Void element has children")

def startTag(self, namespace, name, attrs):
assert namespace is None or isinstance(namespace, string_types), type(namespace)
assert isinstance(name, string_types), type(name)
assert all((namespace is None or isinstance(namespace, string_types)) and
isinstance(name, string_types) and
isinstance(value, string_types)
for (namespace, name), value in attrs.items())

return {"type": "StartTag",
"name": text_type(name),
"namespace": to_text(namespace),
"data": dict(((to_text(namespace, False), to_text(name)),
to_text(value, False))
for (namespace, name), value in attrs.items())}
"name": name,
"namespace": namespace,
"data": attrs}

def endTag(self, namespace, name):
assert namespace is None or isinstance(namespace, string_types), type(namespace)
assert isinstance(name, string_types), type(namespace)

return {"type": "EndTag",
"name": to_text(name, False),
"namespace": to_text(namespace),
"data": {}}
"name": name,
"namespace": namespace}

def text(self, data):
assert isinstance(data, string_types), type(data)

data = to_text(data)
data = data
middle = data.lstrip(spaceCharacters)
left = data[:len(data) - len(middle)]
if left:
Expand All @@ -101,25 +60,16 @@ def text(self, data):
yield {"type": "SpaceCharacters", "data": right}

def comment(self, data):
assert isinstance(data, string_types), type(data)

return {"type": "Comment", "data": text_type(data)}

def doctype(self, name, publicId=None, systemId=None, correct=True):
assert is_text_or_none(name), type(name)
assert is_text_or_none(publicId), type(publicId)
assert is_text_or_none(systemId), type(systemId)
return {"type": "Comment", "data": data}

def doctype(self, name, publicId=None, systemId=None):
return {"type": "Doctype",
"name": to_text(name),
"publicId": to_text(publicId),
"systemId": to_text(systemId),
"correct": to_text(correct)}
"name": name,
"publicId": publicId,
"systemId": systemId}

def entity(self, name):
assert isinstance(name, string_types), type(name)

return {"type": "Entity", "name": text_type(name)}
return {"type": "Entity", "name": name}

def unknown(self, nodeType):
    """Return the error token produced by ``self.error`` for an unknown node.

    The node type is formatted into the message rather than concatenated,
    so a non-string ``nodeType`` (for example an integer) produces an error
    token instead of raising ``TypeError``. The one-element tuple keeps a
    tuple-valued ``nodeType`` from being unpacked as format arguments.
    """
    return self.error("Unknown node type: %s" % (nodeType,))
Expand Down Expand Up @@ -154,7 +104,7 @@ def __iter__(self):

elif type == ELEMENT:
namespace, name, attributes, hasChildren = details
if name in voidElements:
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
for token in self.emptyTag(namespace, name, attributes,
hasChildren):
yield token
Expand Down Expand Up @@ -187,7 +137,7 @@ def __iter__(self):
type, details = details[0], details[1:]
if type == ELEMENT:
namespace, name, attributes, hasChildren = details
if name not in voidElements:
if (namespace and namespace != namespaces["html"]) or name not in voidElements:
yield self.endTag(namespace, name)
if self.tree is currentNode:
currentNode = None
Expand Down
2 changes: 1 addition & 1 deletion html5lib/treewalkers/genshistream.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def tokens(self, event, next):
elif kind == END:
name = data.localname
namespace = data.namespace
if name not in voidElements:
if namespace != namespaces["html"] or name not in voidElements:
yield self.endTag(namespace, name)

elif kind == COMMENT:
Expand Down
6 changes: 5 additions & 1 deletion html5lib/treewalkers/lxmletree.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,10 @@ def __len__(self):
class TreeWalker(_base.NonRecursiveTreeWalker):
def __init__(self, tree):
if hasattr(tree, "getroot"):
self.fragmentChildren = set()
tree = Root(tree)
elif isinstance(tree, list):
self.fragmentChildren = set(tree)
tree = FragmentRoot(tree)
_base.NonRecursiveTreeWalker.__init__(self, tree)
self.filter = ihatexml.InfosetFilter()
Expand All @@ -137,7 +139,7 @@ def getNodeDetails(self, node):
return _base.DOCTYPE, node.name, node.public_id, node.system_id

elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
return _base.TEXT, node.obj
return _base.TEXT, ensure_str(node.obj)

elif node.tag == etree.Comment:
return _base.COMMENT, ensure_str(node.text)
Expand Down Expand Up @@ -197,5 +199,7 @@ def getParentNode(self, node):
if key == "text":
return node
# else: fallback to "normal" processing
elif node in self.fragmentChildren:
return None

return node.getparent()