diff --git a/.pytest.expect b/.pytest.expect index e818b516..bb5398d1 100644 Binary files a/.pytest.expect and b/.pytest.expect differ diff --git a/html5lib/constants.py b/html5lib/constants.py index f6e38cbf..dbef76f1 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -283,6 +283,8 @@ "Element %(name)s not allowed in a non-html context", "unexpected-end-tag-before-html": "Unexpected end tag (%(name)s) before html.", + "unexpected-inhead-noscript-tag": + "Element %(name)s not allowed in a inhead-noscript context", "XXX-undefined-error": "Undefined error (this sucks and should be fixed)", } diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index a7cb98be..b1e41029 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -22,18 +22,18 @@ def parse(doc, treebuilder="etree", encoding=None, - namespaceHTMLElements=True): + namespaceHTMLElements=True, scripting=False): """Parse a string or file-like object into a tree""" tb = treebuilders.getTreeBuilder(treebuilder) p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) - return p.parse(doc, encoding=encoding) + return p.parse(doc, encoding=encoding, scripting=scripting) def parseFragment(doc, container="div", treebuilder="etree", encoding=None, - namespaceHTMLElements=True): + namespaceHTMLElements=True, scripting=False): tb = treebuilders.getTreeBuilder(treebuilder) p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) - return p.parseFragment(doc, container=container, encoding=encoding) + return p.parseFragment(doc, container=container, encoding=encoding, scripting=scripting) def method_decorator_metaclass(function): @@ -78,11 +78,12 @@ def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer, self.phases = dict([(name, cls(self, self.tree)) for name, cls in getPhases(debug).items()]) - def _parse(self, stream, innerHTML=False, container="div", - encoding=None, parseMeta=True, useChardet=True, **kwargs): + def _parse(self, stream, innerHTML=False, container="div", encoding=None, + parseMeta=True, useChardet=True, scripting=False, **kwargs): self.innerHTMLMode = innerHTML self.container = container + self.scripting = scripting self.tokenizer = self.tokenizer_class(stream, encoding=encoding, parseMeta=parseMeta, useChardet=useChardet, @@ -222,7 +223,8 @@ def normalizedTokens(self): for token in self.tokenizer: yield self.normalizeToken(token) - def parse(self, stream, encoding=None, parseMeta=True, useChardet=True): + def parse(self, stream, encoding=None, parseMeta=True, + useChardet=True, scripting=False): """Parse a HTML document into a well-formed tree stream - a filelike object or string containing the HTML to be parsed @@ -231,13 +233,15 @@ def parse(self, stream, encoding=None, parseMeta=True, useChardet=True): the encoding. If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element) + + scripting - treat noscript elements as if javascript was turned on """ self._parse(stream, innerHTML=False, encoding=encoding, - parseMeta=parseMeta, useChardet=useChardet) + parseMeta=parseMeta, useChardet=useChardet, scripting=scripting) return self.tree.getDocument() def parseFragment(self, stream, container="div", encoding=None, - parseMeta=False, useChardet=True): + parseMeta=False, useChardet=True, scripting=False): """Parse a HTML fragment into a well-formed tree fragment container - name of the element we're setting the innerHTML property @@ -249,8 +253,11 @@ def parseFragment(self, stream, container="div", encoding=None, the encoding. If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element) + + scripting - treat noscript elements as if javascript was turned on """ - self._parse(stream, True, container=container, encoding=encoding) + self._parse(stream, True, container=container, + encoding=encoding, scripting=scripting) return self.tree.getFragment() def parseError(self, errorcode="XXX-undefined-error", datavars={}): @@ -708,7 +715,8 @@ def __init__(self, parser, tree): self.startTagHandler = utils.MethodDispatcher([ ("html", self.startTagHtml), ("title", self.startTagTitle), - (("noscript", "noframes", "style"), self.startTagNoScriptNoFramesStyle), + (("noframes", "style"), self.startTagNoFramesStyle), + ("noscript", self.startTagNoscript), ("script", self.startTagScript), (("base", "basefont", "bgsound", "command", "link"), self.startTagBaseLinkCommand), @@ -717,7 +725,7 @@ def __init__(self, parser, tree): ]) self.startTagHandler.default = self.startTagOther - self. endTagHandler = utils.MethodDispatcher([ + self.endTagHandler = utils.MethodDispatcher([ ("head", self.endTagHead), (("br", "html", "body"), self.endTagHtmlBodyBr) ]) @@ -767,10 +775,17 @@ def startTagMeta(self, token): def startTagTitle(self, token): self.parser.parseRCDataRawtext(token, "RCDATA") - def startTagNoScriptNoFramesStyle(self, token): + def startTagNoFramesStyle(self, token): # Need to decide whether to implement the scripting-disabled case self.parser.parseRCDataRawtext(token, "RAWTEXT") + def startTagNoscript(self, token): + if self.parser.scripting: + self.parser.parseRCDataRawtext(token, "RAWTEXT") + else: + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inHeadNoscript"] + def startTagScript(self, token): self.tree.insertElement(token) self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState @@ -796,10 +811,51 @@ def endTagOther(self, token): def anythingElse(self): self.endTagHead(impliedTagToken("head")) - # XXX If we implement a parser for which scripting is disabled we need to - # implement this phase. - # - # class InHeadNoScriptPhase(Phase): + class InHeadNoscriptPhase(Phase): + def __init__(self, parser, tree): + Phase.__init__(self, parser, tree) + + self.startTagHandler = utils.MethodDispatcher([ + ("html", self.startTagHtml), + (("basefont", "bgsound", "link", "meta", "noframes", "style"), self.startTagBaseLinkCommand), + (("head", "noscript"), self.startTagHeadNoscript), + ]) + self.startTagHandler.default = self.startTagOther + + self.endTagHandler = utils.MethodDispatcher([ + ("noscript", self.endTagNoscript), + ("br", self.endTagBr), + ]) + self.endTagHandler.default = self.endTagOther + + def startTagHtml(self, token): + return self.parser.phases["inBody"].processStartTag(token) + + def startTagBaseLinkCommand(self, token): + return self.parser.phases["inHead"].startTagBaseLinkCommand(token) + + def startTagHeadNoscript(self, token): + self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) + + def startTagOther(self, token): + return self.anythingElse(token) + + def endTagNoscript(self, token): + node = self.parser.tree.openElements.pop() + assert node.name == "noscript", "Expected noscript got %s" % node.name + self.parser.phase = self.parser.phases["inHead"] + + def endTagBr(self, token): + return self.anythingElse(token) + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + + def anythingElse(self, token): + self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]}) + self.endTagNoscript(impliedTagToken("noscript")) + return token + class AfterHeadPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) @@ -910,7 +966,8 @@ def __init__(self, parser, tree): ("isindex", self.startTagIsIndex), ("textarea", self.startTagTextarea), ("iframe", self.startTagIFrame), - (("noembed", "noframes", "noscript"), self.startTagRawtext), + ("noscript", self.startTagNoscript), + (("noembed", "noframes"), self.startTagRawtext), ("select", self.startTagSelect), (("rp", "rt"), self.startTagRpRt), (("option", "optgroup"), self.startTagOpt), @@ -1231,6 +1288,12 @@ def startTagIFrame(self, token): self.parser.framesetOK = False self.startTagRawtext(token) + def startTagNoscript(self, token): + if self.parser.scripting: + self.startTagRawtext(token) + else: + self.startTagOther(token) + def startTagRawtext(self, token): """iframe, noembed noframes, noscript(if scripting enabled)""" self.parser.parseRCDataRawtext(token, "RAWTEXT") @@ -2687,7 +2750,7 @@ def processEndTag(self, token): "beforeHtml": BeforeHtmlPhase, "beforeHead": BeforeHeadPhase, "inHead": InHeadPhase, - # XXX "inHeadNoscript": InHeadNoScriptPhase, + "inHeadNoscript": InHeadNoscriptPhase, "afterHead": AfterHeadPhase, "inBody": InBodyPhase, "text": TextPhase, diff --git a/html5lib/tests/tree_construction.py b/html5lib/tests/tree_construction.py index c1125387..608be2b8 100644 --- a/html5lib/tests/tree_construction.py +++ b/html5lib/tests/tree_construction.py @@ -51,14 +51,17 @@ def runtest(self): fragmentContainer = self.test['document-fragment'] expected = self.test['document'] expectedErrors = self.test['errors'].split("\n") if self.test['errors'] else [] + scripting = False + if 'script-on' in self.test: + scripting = True with warnings.catch_warnings(): warnings.simplefilter("error") try: if fragmentContainer: - document = p.parseFragment(input, fragmentContainer) + document = p.parseFragment(input, fragmentContainer, scripting=scripting) else: - document = p.parse(input) + document = p.parse(input, scripting=scripting) except constants.DataLossWarning: pytest.skip("data loss warning")