-
Notifications
You must be signed in to change notification settings - Fork 294
Implement inhead-noscript context, add script parameter to parse #230
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,18 +22,18 @@ | |
|
||
|
||
def parse(doc, treebuilder="etree", encoding=None, | ||
namespaceHTMLElements=True): | ||
namespaceHTMLElements=True, script=True): | ||
"""Parse a string or file-like object into a tree""" | ||
tb = treebuilders.getTreeBuilder(treebuilder) | ||
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) | ||
return p.parse(doc, encoding=encoding) | ||
return p.parse(doc, encoding=encoding, script=script) | ||
|
||
|
||
def parseFragment(doc, container="div", treebuilder="etree", encoding=None, | ||
namespaceHTMLElements=True): | ||
namespaceHTMLElements=True, script=True): | ||
tb = treebuilders.getTreeBuilder(treebuilder) | ||
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) | ||
return p.parseFragment(doc, container=container, encoding=encoding) | ||
return p.parseFragment(doc, container=container, encoding=encoding, script=script) | ||
|
||
|
||
def method_decorator_metaclass(function): | ||
|
@@ -78,11 +78,12 @@ def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer, | |
self.phases = dict([(name, cls(self, self.tree)) for name, cls in | ||
getPhases(debug).items()]) | ||
|
||
def _parse(self, stream, innerHTML=False, container="div", | ||
encoding=None, parseMeta=True, useChardet=True, **kwargs): | ||
def _parse(self, stream, innerHTML=False, container="div", encoding=None, | ||
parseMeta=True, useChardet=True, script=True, **kwargs): | ||
|
||
self.innerHTMLMode = innerHTML | ||
self.container = container | ||
self.scriptMode = script | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
A real nit that doesn't block anything, but given that it should take only seconds to fix: I'd prefer […inline code missing from this capture…] |
||
self.tokenizer = self.tokenizer_class(stream, encoding=encoding, | ||
parseMeta=parseMeta, | ||
useChardet=useChardet, | ||
|
@@ -222,7 +223,8 @@ def normalizedTokens(self): | |
for token in self.tokenizer: | ||
yield self.normalizeToken(token) | ||
|
||
def parse(self, stream, encoding=None, parseMeta=True, useChardet=True): | ||
def parse(self, stream, encoding=None, parseMeta=True, | ||
useChardet=True, script=True): | ||
"""Parse a HTML document into a well-formed tree | ||
|
||
stream - a filelike object or string containing the HTML to be parsed | ||
|
@@ -231,13 +233,15 @@ def parse(self, stream, encoding=None, parseMeta=True, useChardet=True): | |
the encoding. If specified, that encoding will be used, | ||
regardless of any BOM or later declaration (such as in a meta | ||
element) | ||
|
||
script - treat noscript elements as if javascript was turned on | ||
""" | ||
self._parse(stream, innerHTML=False, encoding=encoding, | ||
parseMeta=parseMeta, useChardet=useChardet) | ||
parseMeta=parseMeta, useChardet=useChardet, script=script) | ||
return self.tree.getDocument() | ||
|
||
def parseFragment(self, stream, container="div", encoding=None, | ||
parseMeta=False, useChardet=True): | ||
parseMeta=False, useChardet=True, script=True): | ||
"""Parse a HTML fragment into a well-formed tree fragment | ||
|
||
container - name of the element we're setting the innerHTML property | ||
|
@@ -249,8 +253,11 @@ def parseFragment(self, stream, container="div", encoding=None, | |
the encoding. If specified, that encoding will be used, | ||
regardless of any BOM or later declaration (such as in a meta | ||
element) | ||
|
||
script - treat noscript elements as if javascript was turned on | ||
""" | ||
self._parse(stream, True, container=container, encoding=encoding) | ||
self._parse(stream, True, container=container, | ||
encoding=encoding, script=script) | ||
return self.tree.getFragment() | ||
|
||
def parseError(self, errorcode="XXX-undefined-error", datavars={}): | ||
|
@@ -708,7 +715,8 @@ def __init__(self, parser, tree): | |
self.startTagHandler = utils.MethodDispatcher([ | ||
("html", self.startTagHtml), | ||
("title", self.startTagTitle), | ||
(("noscript", "noframes", "style"), self.startTagNoScriptNoFramesStyle), | ||
(("noframes", "style"), self.startTagNoFramesStyle), | ||
("noscript", self.startTagNoscript), | ||
("script", self.startTagScript), | ||
(("base", "basefont", "bgsound", "command", "link"), | ||
self.startTagBaseLinkCommand), | ||
|
@@ -717,7 +725,7 @@ def __init__(self, parser, tree): | |
]) | ||
self.startTagHandler.default = self.startTagOther | ||
|
||
self. endTagHandler = utils.MethodDispatcher([ | ||
self.endTagHandler = utils.MethodDispatcher([ | ||
("head", self.endTagHead), | ||
(("br", "html", "body"), self.endTagHtmlBodyBr) | ||
]) | ||
|
@@ -767,10 +775,17 @@ def startTagMeta(self, token): | |
def startTagTitle(self, token): | ||
self.parser.parseRCDataRawtext(token, "RCDATA") | ||
|
||
def startTagNoScriptNoFramesStyle(self, token): | ||
def startTagNoFramesStyle(self, token): | ||
# Need to decide whether to implement the scripting-disabled case | ||
self.parser.parseRCDataRawtext(token, "RAWTEXT") | ||
|
||
def startTagNoscript(self, token): | ||
if self.parser.scriptMode: | ||
self.parser.parseRCDataRawtext(token, "RAWTEXT") | ||
else: | ||
self.tree.insertElement(token) | ||
self.parser.phase = self.parser.phases["inHeadNoscript"] | ||
|
||
def startTagScript(self, token): | ||
self.tree.insertElement(token) | ||
self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState | ||
|
@@ -796,10 +811,51 @@ def endTagOther(self, token): | |
def anythingElse(self): | ||
self.endTagHead(impliedTagToken("head")) | ||
|
||
# XXX If we implement a parser for which scripting is disabled we need to | ||
# implement this phase. | ||
# | ||
# class InHeadNoScriptPhase(Phase): | ||
class InHeadNoscriptPhase(Phase): | ||
def __init__(self, parser, tree): | ||
Phase.__init__(self, parser, tree) | ||
|
||
self.startTagHandler = utils.MethodDispatcher([ | ||
("html", self.startTagHtml), | ||
(("basefont", "bgsound", "link", "meta", "noframes", "style"), self.startTagBaseLinkCommand), | ||
(("head", "noscript"), self.startTagHeadNoscript), | ||
]) | ||
self.startTagHandler.default = self.startTagOther | ||
|
||
self.endTagHandler = utils.MethodDispatcher([ | ||
("noscript", self.endTagNoscript), | ||
("br", self.endTagBr), | ||
]) | ||
self.endTagHandler.default = self.endTagOther | ||
|
||
def startTagHtml(self, token): | ||
return self.parser.phases["inBody"].processStartTag(token) | ||
|
||
def startTagBaseLinkCommand(self, token): | ||
return self.parser.phases["inHead"].startTagBaseLinkCommand(token) | ||
|
||
def startTagHeadNoscript(self, token): | ||
self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) | ||
|
||
def startTagOther(self, token): | ||
return self.anythingElse(token) | ||
|
||
def endTagNoscript(self, token): | ||
node = self.parser.tree.openElements.pop() | ||
assert node.name == "noscript", "Expected noscript got %s" % node.name | ||
self.parser.phase = self.parser.phases["inHead"] | ||
|
||
def endTagBr(self, token): | ||
return self.anythingElse(token) | ||
|
||
def endTagOther(self, token): | ||
self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) | ||
|
||
def anythingElse(self, token): | ||
self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]}) | ||
self.endTagNoscript(impliedTagToken("noscript")) | ||
return token | ||
|
||
class AfterHeadPhase(Phase): | ||
def __init__(self, parser, tree): | ||
Phase.__init__(self, parser, tree) | ||
|
@@ -910,7 +966,8 @@ def __init__(self, parser, tree): | |
("isindex", self.startTagIsIndex), | ||
("textarea", self.startTagTextarea), | ||
("iframe", self.startTagIFrame), | ||
(("noembed", "noframes", "noscript"), self.startTagRawtext), | ||
("noscript", self.startTagNoscript), | ||
(("noembed", "noframes"), self.startTagRawtext), | ||
("select", self.startTagSelect), | ||
(("rp", "rt"), self.startTagRpRt), | ||
(("option", "optgroup"), self.startTagOpt), | ||
|
@@ -1231,6 +1288,12 @@ def startTagIFrame(self, token): | |
self.parser.framesetOK = False | ||
self.startTagRawtext(token) | ||
|
||
def startTagNoscript(self, token): | ||
if self.parser.scriptMode: | ||
self.startTagRawtext(token) | ||
else: | ||
self.startTagOther(token) | ||
|
||
def startTagRawtext(self, token): | ||
"""iframe, noembed noframes, noscript(if scripting enabled)""" | ||
self.parser.parseRCDataRawtext(token, "RAWTEXT") | ||
|
@@ -2687,7 +2750,7 @@ def processEndTag(self, token): | |
"beforeHtml": BeforeHtmlPhase, | ||
"beforeHead": BeforeHeadPhase, | ||
"inHead": InHeadPhase, | ||
# XXX "inHeadNoscript": InHeadNoScriptPhase, | ||
"inHeadNoscript": InHeadNoscriptPhase, | ||
"afterHead": AfterHeadPhase, | ||
"inBody": InBodyPhase, | ||
"text": TextPhase, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -51,14 +51,17 @@ def runtest(self): | |
fragmentContainer = self.test['document-fragment'] | ||
expected = self.test['document'] | ||
expectedErrors = self.test['errors'].split("\n") if self.test['errors'] else [] | ||
script = True | ||
if 'script-off' in self.test: | ||
script = False | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
I wonder if this is a horrible suggestion, but I wonder if it's worthwhile doing something like […inline code missing from this capture…]
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
This tests only […inline code missing from this capture…]
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Right. But we're never going to pass anything containing […inline code missing from this capture…]
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Huh. Is that supposed to pass? Tests under tree-construction/scripted/ contain neither […inline code missing from this capture…]
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Uh, IIRC everything under […inline code missing from this capture…] |
||
|
||
with warnings.catch_warnings(): | ||
warnings.simplefilter("error") | ||
try: | ||
if fragmentContainer: | ||
document = p.parseFragment(input, fragmentContainer) | ||
document = p.parseFragment(input, fragmentContainer, script=script) | ||
else: | ||
document = p.parse(input) | ||
document = p.parse(input, script=script) | ||
except constants.DataLossWarning: | ||
pytest.skip("data loss warning") | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Bikeshed moment: Can we call this `scripting`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not a problem. I had doubts naming it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, I'd rather we default to False, given we don't actually implement scripting.