diff --git a/.prospector.yaml b/.prospector.yaml new file mode 100644 index 00000000..7e8efe1a --- /dev/null +++ b/.prospector.yaml @@ -0,0 +1,21 @@ +strictness: veryhigh +doc-warnings: false +test-warnings: false + +max-line-length: 139 + +requirements: + - requirements.txt + - requirements-test.txt + - requirements-optional.txt + +ignore-paths: + - parse.py + - utils/ + +python-targets: + - 2 + - 3 + +mccabe: + run: false diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 00000000..ea74d5db --- /dev/null +++ b/.pylintrc @@ -0,0 +1,10 @@ +[MASTER] +ignore=tests + +[MESSAGES CONTROL] +# messages up to fixme should probably be fixed somehow +disable = redefined-builtin,attribute-defined-outside-init,anomalous-backslash-in-string,no-self-use,redefined-outer-name,bad-continuation,wrong-import-order,superfluous-parens,no-member,duplicate-code,super-init-not-called,abstract-method,property-on-old-class,wrong-import-position,no-name-in-module,no-init,bad-mcs-classmethod-argument,bad-classmethod-argument,fixme,invalid-name,import-error,too-few-public-methods,too-many-ancestors,too-many-arguments,too-many-boolean-expressions,too-many-branches,too-many-instance-attributes,too-many-locals,too-many-lines,too-many-public-methods,too-many-return-statements,too-many-statements,missing-docstring,line-too-long,locally-disabled,locally-enabled,bad-builtin,deprecated-lambda + +[FORMAT] +max-line-length=139 +single-line-if-stmt=no diff --git a/flake8-run.sh b/flake8-run.sh index 685ec6ab..d9264946 100755 --- a/flake8-run.sh +++ b/flake8-run.sh @@ -5,8 +5,5 @@ if [[ ! -x $(which flake8) ]]; then exit 1 fi -find html5lib/ -name '*.py' -and -not -name 'constants.py' -print0 | xargs -0 flake8 --ignore=E501 -flake1=$? -flake8 --max-line-length=99 --ignore=E126 html5lib/constants.py -flake2=$? -exit $[$flake1 || $flake2] +flake8 `dirname $0` +exit $? 
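The configuration above pairs with the inline suppressions used throughout the rest of this diff: a bare "# noqa" silences flake8 on a single line, while "pylint:disable"/"pylint:enable" comments scope a named pylint message to a region. A minimal sketch of both styles, using a hypothetical module that is not part of this change:

```python
import re

# flake8: a bare "# noqa" comment skips every check on that one line,
# which is how the very long regexes in ihatexml.py and inputstream.py pass.
LONG_PATTERN = re.compile("[a-z]" + "x" * 150)  # noqa

# pylint: disable/enable comments scope a named message to a region, as done
# around the import blocks in tests/support.py and trie/__init__.py below.
# pylint:disable=invalid-name
badlyNamedButTolerated = 1
# pylint:enable=invalid-name
```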
diff --git a/html5lib/constants.py b/html5lib/constants.py index 2244933c..df1f061e 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -2819,7 +2819,6 @@ 0x0d: "\u000D", 0x80: "\u20AC", 0x81: "\u0081", - 0x81: "\u0081", 0x82: "\u201A", 0x83: "\u0192", 0x84: "\u201E", diff --git a/html5lib/filters/sanitizer.py b/html5lib/filters/sanitizer.py index caddd318..7f81c0d1 100644 --- a/html5lib/filters/sanitizer.py +++ b/html5lib/filters/sanitizer.py @@ -765,15 +765,15 @@ def sanitize_token(self, token): if ((namespace, name) in self.allowed_elements or (namespace is None and (namespaces["html"], name) in self.allowed_elements)): - return self.allowed_token(token, token_type) + return self.allowed_token(token) else: - return self.disallowed_token(token, token_type) + return self.disallowed_token(token) elif token_type == "Comment": pass else: return token - def allowed_token(self, token, token_type): + def allowed_token(self, token): if "data" in token: attrs = token["data"] attr_names = set(attrs.keys()) @@ -823,7 +823,8 @@ def allowed_token(self, token, token_type): token["data"] = attrs return token - def disallowed_token(self, token, token_type): + def disallowed_token(self, token): + token_type = token["type"] if token_type == "EndTag": token["data"] = "</%s>" % token["name"] elif token["data"]: @@ -862,7 +863,7 @@ def sanitize_css(self, style): 'padding']: for keyword in value.split(): if keyword not in self.allowed_css_keywords and \ - not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): + not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): # noqa break else: clean.append(prop + ': ' + value + ';') diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index e6808425..331b8fd7 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -121,7 +121,7 @@ def reset(self): self.phase.insertHtmlElement() self.resetInsertionMode() else: - self.innerHTML = False + self.innerHTML = False # pylint:disable=redefined-variable-type self.phase = self.phases["initial"] self.lastPhase = None @@ -241,6 +241,7 @@ def parse(self, stream, encoding=None, parseMeta=True, def parseFragment(self, stream, container="div", encoding=None, parseMeta=False, useChardet=True, scripting=False): + # pylint:disable=unused-argument """Parse a HTML fragment into a well-formed tree fragment container - name of the element we're setting the innerHTML property @@ -259,8 +260,10 @@ def parseFragment(self, stream, container="div", encoding=None, encoding=encoding, scripting=scripting) return self.tree.getFragment() - def parseError(self, errorcode="XXX-undefined-error", datavars={}): + def parseError(self, errorcode="XXX-undefined-error", datavars=None): # XXX The idea is to make errorcode mandatory.
+ if datavars is None: + datavars = {} self.errors.append((self.tokenizer.stream.position(), errorcode, datavars)) if self.strict: raise ParseError(E[errorcode] % datavars) @@ -361,6 +364,7 @@ def adjustForeignAttributes(self, token): del token["data"][originalName] def reparseTokenNormal(self, token): + # pylint:disable=unused-argument self.parser.phase() def resetInsertionMode(self): @@ -458,6 +462,7 @@ def getMetaclass(use_metaclass, metaclass_func): else: return type + # pylint:disable=unused-argument class Phase(with_metaclass(getMetaclass(debug, log))): """Base class for helper object that implements each phase of processing """ @@ -948,8 +953,8 @@ class InBodyPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) - # Keep a ref to this for special handling of whitespace in <pre>
- self.processSpaceCharactersNonPre = self.processSpaceCharacters + # Set this to the default handler + self.processSpaceCharacters = self.processSpaceCharactersNonPre self.startTagHandler = utils.MethodDispatcher([ ("html", self.startTagHtml), @@ -1082,7 +1087,7 @@ def processCharacters(self, token): for char in token["data"]])): self.parser.framesetOK = False - def processSpaceCharacters(self, token): + def processSpaceCharactersNonPre(self, token): self.tree.reconstructActiveFormattingElements() self.tree.insertText(token["data"]) @@ -2763,6 +2768,7 @@ def startTagOther(self, token): def processEndTag(self, token): self.parser.parseError("expected-eof-but-got-end-tag", {"name": token["name"]}) + # pylint:enable=unused-argument return { "initial": InitialPhase, diff --git a/html5lib/ihatexml.py b/html5lib/ihatexml.py index 5da5d938..d6d1d6fb 100644 --- a/html5lib/ihatexml.py +++ b/html5lib/ihatexml.py @@ -175,9 +175,9 @@ def escapeRegexp(string): return string # output from the above -nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1f
bd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') +nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa -nonXmlNameFirstBMPRegexp = 
re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') +nonXmlNameFirstBMPRegexp = 
re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa # Simpler things nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]") @@ -186,7 +186,7 @@ def escapeRegexp(string): class InfosetFilter(object): replacementRegexp = re.compile(r"U[\dA-F]{5,5}") - def __init__(self, replaceChars=None, + def __init__(self, dropXmlnsLocalName=False, dropXmlnsAttrNs=False, preventDoubleDashComments=False, @@ -217,7 +217,7 @@ def coerceAttribute(self, name, namespace=None): else: return self.toXmlName(name) - def coerceElement(self, name, namespace=None): + def coerceElement(self, name): return self.toXmlName(name) def coerceComment(self, data): @@ -232,7 +232,7 @@ def coerceComment(self, data): def coerceCharacters(self, data): if self.replaceFormFeedCharacters: - for i in range(data.count("\x0C")): + for _ in range(data.count("\x0C")): warnings.warn("Text cannot contain U+000C", DataLossWarning) data = data.replace("\x0C", " ") # Other non-xml characters diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index 15acba0d..58d626c9 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -19,12 +19,6 @@ except ImportError: BytesIO = StringIO -try: - from io import BufferedIOBase -except ImportError: - class BufferedIOBase(object): - pass - # Non-unicode versions of constants for use in the pre-parser spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters]) asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters]) @@ -32,15 +26,17 @@ class 
BufferedIOBase(object): spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"]) -invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" +invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa if utils.supports_lone_surrogates: # Use one extra step of indirection and create surrogates with - # unichr. Not using this indirection would introduce an illegal + # eval. Not using this indirection would introduce an illegal # unicode literal on platforms not supporting such lone # surrogates. - invalid_unicode_re = re.compile(invalid_unicode_no_surrogate + - eval('"\\uD800-\\uDFFF"')) + assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1 + invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] + + eval('"\\uD800-\\uDFFF"') + # pylint:disable=eval-used + "]") else: invalid_unicode_re = re.compile(invalid_unicode_no_surrogate) @@ -296,7 +292,7 @@ def readChunk(self, chunkSize=None): return True def characterErrorsUCS4(self, data): - for i in range(len(invalid_unicode_re.findall(data))): + for _ in range(len(invalid_unicode_re.findall(data))): self.errors.append("invalid-codepoint") def characterErrorsUCS2(self, data): @@ -453,7 +449,7 @@ def openStream(self, source): try: stream.seek(stream.tell()) - except: + except: # pylint:disable=bare-except stream = BufferedStream(stream) return stream @@ -571,6 +567,7 @@ def __new__(self, value): return bytes.__new__(self, value.lower()) def __init__(self, value): + # pylint:disable=unused-argument self._position = -1 def __iter__(self): @@ -681,7 +678,7 @@ def getEncoding(self): [...] diff --git a/html5lib/serializer.py b/html5lib/serializer.py [...] - if ((is_ucs4 and len(v) > 1) or - (not is_ucs4 and len(v) > 2)): - continue - if v != "&": - if len(v) == 2: - v = utils.surrogatePairToCodepoint(v) - else: - v = ord(v) - if v not in encode_entity_map or k.islower(): - # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
- encode_entity_map[v] = k - - def htmlentityreplace_errors(exc): - if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)): - res = [] - codepoints = [] - skip = False - for i, c in enumerate(exc.object[exc.start:exc.end]): - if skip: - skip = False - continue - index = i + exc.start - if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]): - codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2]) - skip = True - else: - codepoint = ord(c) - codepoints.append(codepoint) - for cp in codepoints: - e = encode_entity_map.get(cp) - if e: - res.append("&") - res.append(e) - if not e.endswith(";"): - res.append(";") - else: - res.append("&#x%s;" % (hex(cp)[2:])) - return ("".join(res), exc.end) - else: - return xmlcharrefreplace_errors(exc) - register_error(unicode_encode_errors, htmlentityreplace_errors) +encode_entity_map = {} +is_ucs4 = len("\U0010FFFF") == 1 +for k, v in list(entities.items()): + # skip multi-character entities + if ((is_ucs4 and len(v) > 1) or + (not is_ucs4 and len(v) > 2)): + continue + if v != "&": + if len(v) == 2: + v = utils.surrogatePairToCodepoint(v) + else: + v = ord(v) + if v not in encode_entity_map or k.islower(): + # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc. + encode_entity_map[v] = k + + +def htmlentityreplace_errors(exc): + if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)): + res = [] + codepoints = [] + skip = False + for i, c in enumerate(exc.object[exc.start:exc.end]): + if skip: + skip = False + continue + index = i + exc.start + if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]): + codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2]) + skip = True + else: + codepoint = ord(c) + codepoints.append(codepoint) + for cp in codepoints: + e = encode_entity_map.get(cp) + if e: + res.append("&") + res.append(e) + if not e.endswith(";"): + res.append(";") + else: + res.append("&#x%s;" % (hex(cp)[2:])) + return ("".join(res), exc.end) + else: + return xmlcharrefreplace_errors(exc) - del register_error +register_error("htmlentityreplace", htmlentityreplace_errors) class HTMLSerializer(object): @@ -168,7 +163,7 @@ def __init__(self, **kwargs): def encode(self, string): assert(isinstance(string, text_type)) if self.encoding: - return string.encode(self.encoding, unicode_encode_errors) + return string.encode(self.encoding, "htmlentityreplace") else: return string @@ -180,6 +175,7 @@ def encodeStrict(self, string): return string def serialize(self, treewalker, encoding=None): + # pylint:disable=too-many-nested-blocks self.encoding = encoding in_cdata = False self.errors = [] @@ -241,7 +237,7 @@ def serialize(self, treewalker, encoding=None): in_cdata = True elif in_cdata: self.serializeError("Unexpected child element of a CDATA element") - for (attr_namespace, attr_name), attr_value in token["data"].items(): + for (_, attr_name), attr_value in token["data"].items(): # TODO: Add namespace support here k = attr_name v = attr_value @@ -328,6 +324,6 @@ def serializeError(self, data="XXX ERROR MESSAGE NEEDED"): raise SerializeError -def SerializeError(Exception): +class SerializeError(Exception): """Error in serialized tree""" pass diff --git a/html5lib/tests/support.py b/html5lib/tests/support.py index 6e6a916b..6ae09dbe 100644 --- a/html5lib/tests/support.py +++ b/html5lib/tests/support.py @@ -1,5 +1,7 @@ from __future__ import absolute_import, division, unicode_literals + +# pylint:disable=wrong-import-position + import os import sys import codecs @@ -13,7 +15,7 @@ os.path.pardir,
os.path.pardir))) -from html5lib import treebuilders, treewalkers, treeadapters +from html5lib import treebuilders, treewalkers, treeadapters # noqa del base_path # Build a dict of available trees @@ -26,14 +28,14 @@ } # ElementTree impls -import xml.etree.ElementTree as ElementTree +import xml.etree.ElementTree as ElementTree # noqa treeTypes['ElementTree'] = { "builder": treebuilders.getTreeBuilder("etree", ElementTree, fullTree=True), "walker": treewalkers.getTreeWalker("etree", ElementTree) } try: - import xml.etree.cElementTree as cElementTree + import xml.etree.cElementTree as cElementTree # noqa except ImportError: treeTypes['cElementTree'] = None else: @@ -47,7 +49,7 @@ } try: - import lxml.etree as lxml # flake8: noqa + import lxml.etree as lxml # noqa except ImportError: treeTypes['lxml'] = None else: @@ -58,7 +60,7 @@ # Genshi impls try: - import genshi # flake8: noqa + import genshi # noqa except ImportError: pass else: @@ -68,6 +70,8 @@ "walker": treewalkers.getTreeWalker("genshi") } +# pylint:enable=wrong-import-position + def get_data_files(subdirectory, files='*.dat', search_dir=test_dir): return sorted(glob.glob(os.path.join(search_dir, subdirectory, files))) diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py index 09504654..c5d2af12 100644 --- a/html5lib/tests/test_encoding.py +++ b/html5lib/tests/test_encoding.py @@ -51,19 +51,21 @@ def runPreScanEncodingTest(data, encoding): def test_encoding(): for filename in get_data_files("encoding"): tests = _TestData(filename, b"data", encoding=None) - for idx, test in enumerate(tests): + for test in tests: yield (runParserEncodingTest, test[b'data'], test[b'encoding']) yield (runPreScanEncodingTest, test[b'data'], test[b'encoding']) +# pylint:disable=wrong-import-position try: try: - import charade # flake8: noqa + import charade # noqa except ImportError: - import chardet # flake8: noqa + import chardet # noqa except ImportError: print("charade/chardet not found, skipping chardet tests") else: def test_chardet(): - with open(os.path.join(test_dir, "encoding" , "chardet", "test_big5.txt"), "rb") as fp: + with open(os.path.join(test_dir, "encoding", "chardet", "test_big5.txt"), "rb") as fp: encoding = inputstream.HTMLInputStream(fp.read()).charEncoding assert encoding[0].name == "big5" +# pylint:enable=wrong-import-position diff --git a/html5lib/tests/test_parser2.py b/html5lib/tests/test_parser2.py index 2f3ba2c8..f8e1ac43 100644 --- a/html5lib/tests/test_parser2.py +++ b/html5lib/tests/test_parser2.py @@ -2,10 +2,8 @@ import io -import pytest +from . import support # noqa -from . 
import support # flake8: noqa -from html5lib import html5parser from html5lib.constants import namespaces from html5lib import parse @@ -23,29 +21,29 @@ def test_line_counter(): def test_namespace_html_elements_0_dom(): doc = parse("<html></html>", - treebuilder="dom", - namespaceHTMLElements=True) + treebuilder="dom", + namespaceHTMLElements=True) assert doc.childNodes[0].namespaceURI == namespaces["html"] def test_namespace_html_elements_1_dom(): doc = parse("<html></html>", - treebuilder="dom", - namespaceHTMLElements=False) + treebuilder="dom", + namespaceHTMLElements=False) assert doc.childNodes[0].namespaceURI is None def test_namespace_html_elements_0_etree(): doc = parse("<html></html>", - treebuilder="etree", - namespaceHTMLElements=True) + treebuilder="etree", + namespaceHTMLElements=True) assert doc.tag == "{%s}html" % (namespaces["html"],) def test_namespace_html_elements_1_etree(): doc = parse("<html></html>", - treebuilder="etree", - namespaceHTMLElements=False) + treebuilder="etree", + namespaceHTMLElements=False) assert doc.tag == "html" diff --git a/html5lib/tests/test_sanitizer.py b/html5lib/tests/test_sanitizer.py index 1f8a06f6..e19deea8 100644 --- a/html5lib/tests/test_sanitizer.py +++ b/html5lib/tests/test_sanitizer.py @@ -4,7 +4,7 @@ from html5lib.filters import sanitizer -def runSanitizerTest(name, expected, input): +def runSanitizerTest(_, expected, input): parsed = parseFragment(expected) expected = serialize(parsed, omit_optional_tags=False, @@ -63,7 +63,8 @@ def test_sanitizer(): for ns, tag_name in sanitizer.allowed_elements: if ns != constants.namespaces["html"]: continue - if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', 'select']: + if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', + 'tfoot', 'th', 'thead', 'tr', 'select']: continue # TODO if tag_name == 'image': yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name, diff --git a/html5lib/tests/test_serializer.py b/html5lib/tests/test_serializer.py index b3ffe0df..b3cda7d7 100644 --- a/html5lib/tests/test_serializer.py +++ b/html5lib/tests/test_serializer.py @@ -12,6 +12,7 @@ from html5lib.serializer import HTMLSerializer, serialize from html5lib.treewalkers._base import TreeWalker +# pylint:disable=wrong-import-position optionals_loaded = [] try: @@ -19,6 +20,7 @@ optionals_loaded.append("lxml") except ImportError: pass +# pylint:enable=wrong-import-position default_namespace = constants.namespaces["html"] @@ -219,5 +221,5 @@ def test_serializer(): for filename in get_data_files('serializer-testdata', '*.test', os.path.dirname(__file__)): with open(filename) as fp: tests = json.load(fp) - for index, test in enumerate(tests['tests']): + for test in tests['tests']: yield runSerializerTest, test["input"], test["expected"], test.get("options", {}) diff --git a/html5lib/tests/test_stream.py b/html5lib/tests/test_stream.py index 3b659fbb..77e411d5 100644 --- a/html5lib/tests/test_stream.py +++ b/html5lib/tests/test_stream.py @@ -1,15 +1,20 @@ from __future__ import absolute_import, division, unicode_literals -from . import support # flake8: noqa +from . 
import support # noqa + import codecs -from io import BytesIO -import socket +import sys +from io import BytesIO, StringIO + +import pytest import six from six.moves import http_client, urllib from html5lib.inputstream import (BufferedStream, HTMLInputStream, HTMLUnicodeInputStream, HTMLBinaryInputStream) +from html5lib.utils import supports_lone_surrogates + def test_basic(): s = b"abc" @@ -17,6 +22,7 @@ def test_basic(): read = fp.read(10) assert read == s + def test_read_length(): fp = BufferedStream(BytesIO(b"abcdef")) read1 = fp.read(1) @@ -28,17 +34,23 @@ def test_read_length(): read4 = fp.read(4) assert read4 == b"" + def test_tell(): fp = BufferedStream(BytesIO(b"abcdef")) read1 = fp.read(1) + assert read1 == b"a" assert fp.tell() == 1 read2 = fp.read(2) + assert read2 == b"bc" assert fp.tell() == 3 read3 = fp.read(3) + assert read3 == b"def" assert fp.tell() == 6 read4 = fp.read(4) + assert read4 == b"" assert fp.tell() == 6 + def test_seek(): fp = BufferedStream(BytesIO(b"abcdef")) read1 = fp.read(1) @@ -55,20 +67,26 @@ def test_seek(): read5 = fp.read(2) assert read5 == b"ef" + def test_seek_tell(): fp = BufferedStream(BytesIO(b"abcdef")) read1 = fp.read(1) + assert read1 == b"a" assert fp.tell() == 1 fp.seek(0) read2 = fp.read(1) + assert read2 == b"a" assert fp.tell() == 1 read3 = fp.read(2) + assert read3 == b"bc" assert fp.tell() == 3 fp.seek(2) read4 = fp.read(2) + assert read4 == b"cd" assert fp.tell() == 4 fp.seek(4) read5 = fp.read(2) + assert read5 == b"ef" assert fp.tell() == 6 @@ -85,11 +103,13 @@ def test_char_ascii(): assert stream.charEncoding[0].name == 'windows-1252' assert stream.char() == "'" + def test_char_utf8(): stream = HTMLInputStream('\u2018'.encode('utf-8'), encoding='utf-8') assert stream.charEncoding[0].name == 'utf-8' assert stream.char() == '\u2018' + def test_char_win1252(): stream = HTMLInputStream("\xa9\xf1\u2019".encode('windows-1252')) assert stream.charEncoding[0].name == 'windows-1252' @@ -97,16 +117,19 @@ def test_char_win1252(): assert stream.char() == "\xf1" assert stream.char() == "\u2019" + def test_bom(): stream = HTMLInputStream(codecs.BOM_UTF8 + b"'") assert stream.charEncoding[0].name == 'utf-8' assert stream.char() == "'" + def test_utf_16(): stream = HTMLInputStream((' ' * 1025).encode('utf-16')) assert stream.charEncoding[0].name in ['utf-16le', 'utf-16be'] assert len(stream.charsUntil(' ', True)) == 1025 + def test_newlines(): stream = HTMLBinaryInputStreamShortChunk(codecs.BOM_UTF8 + b"a\nbb\r\nccc\rddddxe") assert stream.position() == (1, 0) @@ -117,11 +140,13 @@ def test_newlines(): assert stream.charsUntil('e') == "x" assert stream.position() == (4, 5) + def test_newlines2(): size = HTMLUnicodeInputStream._defaultChunkSize stream = HTMLInputStream("\r" * size + "\n") assert stream.charsUntil('x') == "\n" * size + def test_position(): stream = HTMLBinaryInputStreamShortChunk(codecs.BOM_UTF8 + b"a\nbb\nccc\nddde\nf\ngh") assert stream.position() == (1, 0) @@ -140,6 +165,7 @@ def test_position(): assert stream.charsUntil('h') == "e\nf\ng" assert stream.position() == (6, 1) + def test_position2(): stream = HTMLUnicodeInputStreamShortChunk("abc\nd") assert stream.position() == (1, 0) @@ -154,6 +180,7 @@ def test_position2(): assert stream.char() == "d" assert stream.position() == (2, 1) + def test_python_issue_20007(): """ Make sure we have a work-around for Python bug #20007 @@ -161,6 +188,7 @@ def test_python_issue_20007(): """ class FakeSocket(object): def makefile(self, _mode, _bufsize=None): + # pylint:disable=unused-argument 
return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText") source = http_client.HTTPResponse(FakeSocket()) @@ -168,6 +196,7 @@ def makefile(self, _mode, _bufsize=None): stream = HTMLInputStream(source) assert stream.charsUntil(" ") == "Text" + def test_python_issue_20007_b(): """ Make sure we have a work-around for Python bug #20007 @@ -178,6 +207,7 @@ def test_python_issue_20007_b(): class FakeSocket(object): def makefile(self, _mode, _bufsize=None): + # pylint:disable=unused-argument return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText") source = http_client.HTTPResponse(FakeSocket()) @@ -185,3 +215,109 @@ def makefile(self, _mode, _bufsize=None): wrapped = urllib.response.addinfourl(source, source.msg, "http://example.com") stream = HTMLInputStream(wrapped) assert stream.charsUntil(" ") == "Text" + + +@pytest.mark.parametrize("inp,num", + [("\u0000", 0), + ("\u0001", 1), + ("\u0008", 1), + ("\u0009", 0), + ("\u000A", 0), + ("\u000B", 1), + ("\u000C", 0), + ("\u000D", 0), + ("\u000E", 1), + ("\u001F", 1), + ("\u0020", 0), + ("\u007E", 0), + ("\u007F", 1), + ("\u009F", 1), + ("\u00A0", 0), + ("\uFDCF", 0), + ("\uFDD0", 1), + ("\uFDEF", 1), + ("\uFDF0", 0), + ("\uFFFD", 0), + ("\uFFFE", 1), + ("\uFFFF", 1), + ("\U0001FFFD", 0), + ("\U0001FFFE", 1), + ("\U0001FFFF", 1), + ("\U0002FFFD", 0), + ("\U0002FFFE", 1), + ("\U0002FFFF", 1), + ("\U0003FFFD", 0), + ("\U0003FFFE", 1), + ("\U0003FFFF", 1), + ("\U0004FFFD", 0), + ("\U0004FFFE", 1), + ("\U0004FFFF", 1), + ("\U0005FFFD", 0), + ("\U0005FFFE", 1), + ("\U0005FFFF", 1), + ("\U0006FFFD", 0), + ("\U0006FFFE", 1), + ("\U0006FFFF", 1), + ("\U0007FFFD", 0), + ("\U0007FFFE", 1), + ("\U0007FFFF", 1), + ("\U0008FFFD", 0), + ("\U0008FFFE", 1), + ("\U0008FFFF", 1), + ("\U0009FFFD", 0), + ("\U0009FFFE", 1), + ("\U0009FFFF", 1), + ("\U000AFFFD", 0), + ("\U000AFFFE", 1), + ("\U000AFFFF", 1), + ("\U000BFFFD", 0), + ("\U000BFFFE", 1), + ("\U000BFFFF", 1), + ("\U000CFFFD", 0), + ("\U000CFFFE", 1), + ("\U000CFFFF", 1), + ("\U000DFFFD", 0), + ("\U000DFFFE", 1), + ("\U000DFFFF", 1), + ("\U000EFFFD", 0), + ("\U000EFFFE", 1), + ("\U000EFFFF", 1), + ("\U000FFFFD", 0), + ("\U000FFFFE", 1), + ("\U000FFFFF", 1), + ("\U0010FFFD", 0), + ("\U0010FFFE", 1), + ("\U0010FFFF", 1), + ("\x01\x01\x01", 3), + ("a\x01a\x01a\x01a", 3)]) +def test_invalid_codepoints(inp, num): + stream = HTMLUnicodeInputStream(StringIO(inp)) + for _i in range(len(inp)): + stream.char() + assert len(stream.errors) == num + + +@pytest.mark.skipif(not supports_lone_surrogates, reason="doesn't support lone surrogates") +@pytest.mark.parametrize("inp,num", + [("'\\uD7FF'", 0), + ("'\\uD800'", 1), + ("'\\uDBFF'", 1), + ("'\\uDC00'", 1), + ("'\\uDFFF'", 1), + ("'\\uE000'", 0), + ("'\\uD800\\uD800\\uD800'", 3), + ("'a\\uD800a\\uD800a\\uD800a'", 3), + ("'\\uDFFF\\uDBFF'", 2), + pytest.mark.skipif(sys.maxunicode == 0xFFFF, + ("'\\uDBFF\\uDFFF'", 2), + reason="narrow Python")]) +def test_invalid_codepoints_surrogates(inp, num): + inp = eval(inp) # pylint:disable=eval-used + fp = StringIO(inp) + if ord(max(fp.read())) > 0xFFFF: + pytest.skip("StringIO altered string") + fp.seek(0) + stream = HTMLUnicodeInputStream(fp) + for _i in range(len(inp)): + stream.char() + assert len(stream.errors) == num diff --git a/html5lib/tests/test_treeadapters.py b/html5lib/tests/test_treeadapters.py index 5f38b6c3..95e56c00 100644 --- a/html5lib/tests/test_treeadapters.py +++ b/html5lib/tests/test_treeadapters.py @@ -1,6 +1,6 @@ from __future__ import absolute_import, division, unicode_literals -from . import support # flake8: noqa +from . 
import support # noqa import html5lib from html5lib.treeadapters import sax @@ -25,7 +25,7 @@ def test_to_sax(): ('endElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title'), ('characters', '\n '), ('endElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head'), - ('startElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body', {}), + ('startElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body', {}), ('startElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a', {(None, 'href'): '/'}), ('startElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b', {}), ('startElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p', {}), diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py index 045d9d7b..332027ac 100644 --- a/html5lib/tests/test_treewalkers.py +++ b/html5lib/tests/test_treewalkers.py @@ -31,7 +31,7 @@ def test_all_tokens(): {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'}, {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'} ] - for treeName, treeCls in sorted(treeTypes.items()): + for _, treeCls in sorted(treeTypes.items()): if treeCls is None: continue p = html5parser.HTMLParser(tree=treeCls["builder"]) diff --git a/html5lib/tests/tokenizer.py b/html5lib/tests/tokenizer.py index c6163a1f..255c1859 100644 --- a/html5lib/tests/tokenizer.py +++ b/html5lib/tests/tokenizer.py @@ -19,6 +19,7 @@ def __init__(self, initialState, lastStartTag=None): self._lastStartTag = lastStartTag def parse(self, stream, encoding=None, innerHTML=False): + # pylint:disable=unused-argument tokenizer = self.tokenizer(stream, encoding) self.outputTokens = [] diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py index 79774578..dd6ea75f 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/tokenizer.py @@ -1,9 +1,6 @@ from __future__ import absolute_import, division, unicode_literals -try: - chr = unichr # flake8: noqa -except NameError: - pass +from six import unichr as chr from collections import deque @@ -147,8 +144,8 @@ def consumeEntity(self, allowedChar=None, fromAttribute=False): output = "&" charStack = [self.stream.char()] - if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") - or (allowedChar is not None and allowedChar == charStack[0])): + if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or + (allowedChar is not None and allowedChar == charStack[0])): self.stream.unget(charStack[0]) elif charStack[0] == "#": @@ -924,7 +921,7 @@ def attributeNameState(self): if self.lowercaseAttrName: self.currentToken["data"][-1][0] = ( self.currentToken["data"][-1][0].translate(asciiUpper2Lower)) - for name, value in self.currentToken["data"][:-1]: + for name, _ in self.currentToken["data"][:-1]: if self.currentToken["data"][-1][0] == name: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "duplicate-attribute"}) @@ -1716,11 +1713,11 @@ def cdataSectionState(self): else: data.append(char) - data = "".join(data) + data = "".join(data) # pylint:disable=redefined-variable-type # Deal with null here rather than in the parser nullCount = data.count("\u0000") if nullCount > 0: - for i in range(nullCount): + for _ in range(nullCount): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) data = data.replace("\u0000", "\uFFFD") diff --git a/html5lib/treeadapters/__init__.py b/html5lib/treeadapters/__init__.py index 57d71304..4f978466 100644 --- a/html5lib/treeadapters/__init__.py +++ b/html5lib/treeadapters/__init__.py @@ -5,7 
+5,7 @@ __all__ = ["sax"] try: - from . import genshi # flake8: noqa + from . import genshi # noqa except ImportError: pass else: diff --git a/html5lib/treebuilders/_base.py b/html5lib/treebuilders/_base.py index 8196f591..900a724c 100644 --- a/html5lib/treebuilders/_base.py +++ b/html5lib/treebuilders/_base.py @@ -126,6 +126,7 @@ class TreeBuilder(object): commentClass - the class to use for comments doctypeClass - the class to use for doctypes """ + # pylint:disable=not-callable # Document class documentClass = None diff --git a/html5lib/treebuilders/dom.py b/html5lib/treebuilders/dom.py index 8656244f..b7df74b2 100644 --- a/html5lib/treebuilders/dom.py +++ b/html5lib/treebuilders/dom.py @@ -109,7 +109,7 @@ def getNameTuple(self): nameTuple = property(getNameTuple) - class TreeBuilder(_base.TreeBuilder): + class TreeBuilder(_base.TreeBuilder): # pylint:disable=unused-variable def documentClass(self): self.dom = Dom.getDOMImplementation().createDocument(None, None, None) return weakref.proxy(self) @@ -158,6 +158,7 @@ def insertText(self, data, parent=None): else: # HACK: allow text nodes as children of the document node if hasattr(self.dom, '_child_node_types'): + # pylint:disable=protected-access if Node.TEXT_NODE not in self.dom._child_node_types: self.dom._child_node_types = list(self.dom._child_node_types) self.dom._child_node_types.append(Node.TEXT_NODE) diff --git a/html5lib/treebuilders/etree.py b/html5lib/treebuilders/etree.py index 2c8ed19f..d394148d 100644 --- a/html5lib/treebuilders/etree.py +++ b/html5lib/treebuilders/etree.py @@ -1,4 +1,6 @@ from __future__ import absolute_import, division, unicode_literals +# pylint:disable=protected-access + from six import text_type import re @@ -253,7 +255,7 @@ def serializeElement(element, indent=0): return "\n".join(rv) - def tostring(element): + def tostring(element): # pylint:disable=unused-variable """Serialize an element and its child nodes to a string""" rv = [] filter = ihatexml.InfosetFilter() @@ -307,7 +309,7 @@ def serializeElement(element): return "".join(rv) - class TreeBuilder(_base.TreeBuilder): + class TreeBuilder(_base.TreeBuilder): # pylint:disable=unused-variable documentClass = Document doctypeClass = DocumentType elementClass = Element diff --git a/html5lib/treebuilders/etree_lxml.py b/html5lib/treebuilders/etree_lxml.py index 138b30bd..2a69769b 100644 --- a/html5lib/treebuilders/etree_lxml.py +++ b/html5lib/treebuilders/etree_lxml.py @@ -10,6 +10,7 @@ """ from __future__ import absolute_import, division, unicode_literals +# pylint:disable=protected-access import warnings import re @@ -53,7 +54,6 @@ def _getChildNodes(self): def testSerializer(element): rv = [] - finalText = None infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True) def serializeElement(element, indent=0): @@ -128,16 +128,12 @@ def serializeElement(element, indent=0): rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail)) serializeElement(element, 0) - if finalText is not None: - rv.append("|%s\"%s\"" % (' ' * 2, finalText)) - return "\n".join(rv) def tostring(element): """Serialize an element and its child nodes to a string""" rv = [] - finalText = None def serializeElement(element): if not hasattr(element, "tag"): @@ -173,9 +169,6 @@ def serializeElement(element): serializeElement(element) - if finalText is not None: - rv.append("%s\"" % (' ' * 2, finalText)) - return "".join(rv) @@ -193,9 +186,11 @@ def __init__(self, namespaceHTMLElements, fullTree=False): self.namespaceHTMLElements = namespaceHTMLElements class 
Attributes(dict): - def __init__(self, element, value={}): + def __init__(self, element, value=None): + if value is None: + value = {} self._element = element - dict.__init__(self, value) + dict.__init__(self, value) # pylint:disable=non-parent-init-called for key, value in self.items(): if isinstance(key, tuple): name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1])) @@ -303,12 +298,14 @@ def insertDoctype(self, token): self.doctype = doctype def insertCommentInitial(self, data, parent=None): + assert parent is None or parent is self.document + assert self.document._elementTree is None self.initial_comments.append(data) def insertCommentMain(self, data, parent=None): if (parent == self.document and self.document._elementTree.getroot()[-1].tag == comment_type): - warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning) + warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning) super(TreeBuilder, self).insertComment(data, parent) def insertRoot(self, token): diff --git a/html5lib/treewalkers/etree.py b/html5lib/treewalkers/etree.py index 73c8e26a..d3b0c50e 100644 --- a/html5lib/treewalkers/etree.py +++ b/html5lib/treewalkers/etree.py @@ -22,7 +22,7 @@ def getETreeBuilder(ElementTreeImplementation): ElementTree = ElementTreeImplementation ElementTreeCommentType = ElementTree.Comment("asd").tag - class TreeWalker(_base.NonRecursiveTreeWalker): + class TreeWalker(_base.NonRecursiveTreeWalker): # pylint:disable=unused-variable """Given the particular ElementTree representation, this implementation, to avoid using recursion, returns "nodes" as tuples with the following content: @@ -38,7 +38,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker): """ def getNodeDetails(self, node): if isinstance(node, tuple): # It might be the root Element - elt, key, parents, flag = node + elt, _, _, flag = node if flag in ("text", "tail"): return _base.TEXT, getattr(elt, flag) else: diff --git a/html5lib/treewalkers/genshistream.py b/html5lib/treewalkers/genshistream.py index 83cd1654..61cbfede 100644 --- a/html5lib/treewalkers/genshistream.py +++ b/html5lib/treewalkers/genshistream.py @@ -25,7 +25,7 @@ def __iter__(self): yield token def tokens(self, event, next): - kind, data, pos = event + kind, data, _ = event if kind == START: tag, attribs = data name = tag.localname diff --git a/html5lib/treewalkers/lxmletree.py b/html5lib/treewalkers/lxmletree.py index 36850086..7d99adc2 100644 --- a/html5lib/treewalkers/lxmletree.py +++ b/html5lib/treewalkers/lxmletree.py @@ -117,6 +117,7 @@ def __len__(self): class TreeWalker(_base.NonRecursiveTreeWalker): def __init__(self, tree): + # pylint:disable=redefined-variable-type if hasattr(tree, "getroot"): self.fragmentChildren = set() tree = Root(tree) diff --git a/html5lib/trie/__init__.py b/html5lib/trie/__init__.py index a8cca8a9..a5ba4bf1 100644 --- a/html5lib/trie/__init__.py +++ b/html5lib/trie/__init__.py @@ -4,9 +4,11 @@ Trie = PyTrie +# pylint:disable=wrong-import-position try: from .datrie import Trie as DATrie except ImportError: pass else: Trie = DATrie +# pylint:enable=wrong-import-position diff --git a/html5lib/trie/_base.py b/html5lib/trie/_base.py index 724486b1..25eece46 100644 --- a/html5lib/trie/_base.py +++ b/html5lib/trie/_base.py @@ -7,7 +7,8 @@ class Trie(Mapping): """Abstract base class for tries""" def keys(self, prefix=None): - keys = super().keys() + # pylint:disable=arguments-differ + keys = super(Trie, self).keys() if prefix is None: return set(keys) 
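The trie hunk above swaps the zero-argument super() call, which is Python 3 only, for the explicit two-argument form that works on both interpreters listed under python-targets in the new .prospector.yaml. A minimal sketch of the difference, using hypothetical classes:

```python
from __future__ import print_function


class Base(object):
    def keys(self):
        return ["a", "b"]


class Child(Base):
    def keys(self):
        # Zero-argument super() raises TypeError at runtime on Python 2;
        # the explicit two-argument form resolves correctly on both 2 and 3.
        return set(super(Child, self).keys())


print(Child().keys())  # {'a', 'b'} under either interpreter
```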
diff --git a/html5lib/utils.py b/html5lib/utils.py index c70de172..5fe237a0 100644 --- a/html5lib/utils.py +++ b/html5lib/utils.py @@ -22,12 +22,12 @@ # surrogates, and there is no mechanism to further escape such # escapes. try: - _x = eval('"\\uD800"') + _x = eval('"\\uD800"') # pylint:disable=eval-used if not isinstance(_x, text_type): # We need this with u"" because of http://bugs.jython.org/issue2039 - _x = eval('u"\\uD800"') + _x = eval('u"\\uD800"') # pylint:disable=eval-used assert isinstance(_x, text_type) -except: +except: # pylint:disable=bare-except supports_lone_surrogates = False else: supports_lone_surrogates = True @@ -52,7 +52,7 @@ def __init__(self, items=()): # anything here. _dictEntries = [] for name, value in items: - if type(name) in (list, tuple, frozenset, set): + if isinstance(name, (list, tuple, frozenset, set)): for item in name: _dictEntries.append((item, value)) else: diff --git a/parse.py b/parse.py index cceea84d..2ed8f1c2 100755 --- a/parse.py +++ b/parse.py @@ -5,7 +5,6 @@ """ import sys -import os import traceback from optparse import OptionParser @@ -15,9 +14,10 @@ from html5lib import constants from html5lib import utils + def parse(): optParser = getOptParser() - opts,args = optParser.parse_args() + opts, args = optParser.parse_args() encoding = "utf8" try: @@ -25,7 +25,10 @@ def parse(): # Try opening from the internet if f.startswith('http://'): try: - import urllib.request, urllib.parse, urllib.error, cgi + import urllib.request + import urllib.parse + import urllib.error + import cgi f = urllib.request.urlopen(f) contentType = f.headers.get('content-type') if contentType: @@ -41,7 +44,7 @@ def parse(): try: # Try opening from file system f = open(f, "rb") - except IOError as e: + except IOError as e: sys.stderr.write("Unable to open file: %s\n" % e) sys.exit(1) except IndexError: @@ -82,14 +85,15 @@ def parse(): if document: printOutput(p, document, opts) t2 = time.time() - sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1)) + sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)" % (t1 - t0, t2 - t1)) else: - sys.stderr.write("\n\nRun took: %fs"%(t1-t0)) + sys.stderr.write("\n\nRun took: %fs" % (t1 - t0)) else: document = run(parseMethod, f, encoding, opts.scripting) if document: printOutput(p, document, opts) + def run(parseMethod, f, encoding, scripting): try: document = parseMethod(f, encoding=encoding, scripting=scripting) @@ -98,6 +102,7 @@ def run(parseMethod, f, encoding, scripting): traceback.print_exc() return document + def printOutput(parser, document, opts): if opts.encoding: print("Encoding:", parser.tokenizer.stream.charEncoding) @@ -116,7 +121,7 @@ def printOutput(parser, document, opts): elif tb == "etree": sys.stdout.write(utils.default_etree.tostring(document)) elif opts.tree: - if not hasattr(document,'__getitem__'): + if not hasattr(document, '__getitem__'): document = [document] for fragment in document: print(parser.tree.testSerializer(fragment)) @@ -126,7 +131,7 @@ def printOutput(parser, document, opts): kwargs = {} for opt in serializer.HTMLSerializer.options: try: - kwargs[opt] = getattr(opts,opt) + kwargs[opt] = getattr(opts, opt) except: pass if not kwargs['quote_char']: @@ -142,12 +147,14 @@ def printOutput(parser, document, opts): encoding = "utf-8" for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding): sys.stdout.write(text) - if not text.endswith('\n'): sys.stdout.write('\n') + if not text.endswith('\n'): + sys.stdout.write('\n') if 
opts.error: - errList=[] + errList = [] for pos, errorcode, datavars in parser.errors: - errList.append("Line %i Col %i"%pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars) - sys.stdout.write("\nParse errors:\n" + "\n".join(errList)+"\n") + errList.append("Line %i Col %i" % pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars) + sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n") + def getOptParser(): parser = OptionParser(usage=__doc__) diff --git a/setup.cfg b/setup.cfg index 2a9acf13..3152ac54 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,11 @@ [bdist_wheel] universal = 1 + +[pep8] +ignore = N +max-line-length = 139 +exclude = .git,__pycache__,.tox,doc + +[flake8] +ignore = N +max-line-length = 139 diff --git a/setup.py b/setup.py index b6ea24af..b42ba400 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ from setuptools import setup -classifiers=[ +classifiers = [ 'Development Status :: 5 - Production/Stable', 'Intended Audience :: Developers', 'License :: OSI Approved :: MIT License', @@ -20,9 +20,9 @@ 'Programming Language :: Python :: 3.5', 'Topic :: Software Development :: Libraries :: Python Modules', 'Topic :: Text Processing :: Markup :: HTML' - ] +] -packages = ['html5lib'] + ['html5lib.'+name +packages = ['html5lib'] + ['html5lib.' + name for name in os.listdir(os.path.join('html5lib')) if os.path.isdir(os.path.join('html5lib', name)) and not name.startswith('.') and name != 'tests'] @@ -39,9 +39,9 @@ assignments = filter(lambda x: isinstance(x, ast.Assign), t.body) for a in assignments: if (len(a.targets) == 1 and - isinstance(a.targets[0], ast.Name) and - a.targets[0].id == "__version__" and - isinstance(a.value, ast.Str)): + isinstance(a.targets[0], ast.Name) and + a.targets[0].id == "__version__" and + isinstance(a.value, ast.Str)): version = a.value.s setup(name='html5lib', diff --git a/utils/entities.py b/utils/entities.py index 116a27cb..6dccf5f0 100644 --- a/utils/entities.py +++ b/utils/entities.py @@ -2,50 +2,59 @@ import html5lib + def parse(path="html5ents.xml"): return html5lib.parse(open(path), treebuilder="lxml") + def entity_table(tree): return dict((entity_name("".join(tr[0].xpath(".//text()"))), entity_characters(tr[1].text)) for tr in tree.xpath("//h:tbody/h:tr", - namespaces={"h":"http://www.w3.org/1999/xhtml"})) + namespaces={"h": "http://www.w3.org/1999/xhtml"})) + def entity_name(inp): return inp.strip() + def entity_characters(inp): return "".join(codepoint_to_character(item) - for item in inp.split() - if item) + for item in inp.split() + if item) + def codepoint_to_character(inp): - return ("\U000"+inp[2:]).decode("unicode-escape") + return ("\\U000" + inp[2:]).decode("unicode-escape") + def make_tests_json(entities): test_list = make_test_list(entities) tests_json = {"tests": - [make_test(*item) for item in test_list] + [make_test(*item) for item in test_list] } return tests_json + def make_test(name, characters, good): return { - "description":test_description(name, good), - "input":"&%s"%name, - "output":test_expected(name, characters, good) - } + "description": test_description(name, good), + "input": "&%s" % name, + "output": test_expected(name, characters, good) + } + def test_description(name, good): with_semicolon = name.endswith(";") - semicolon_text = {True:"with a semi-colon", - False:"without a semi-colon"}[with_semicolon] + semicolon_text = {True: "with a semi-colon", + False: "without a semi-colon"}[with_semicolon] if good: - text = "Named entity: %s 
%s"%(name, semicolon_text) + text = "Named entity: %s %s" % (name, semicolon_text) else: - text = "Bad named entity: %s %s"%(name, semicolon_text) + text = "Bad named entity: %s %s" % (name, semicolon_text) return text + def test_expected(name, characters, good): rv = [] if not good or not name.endswith(";"): @@ -53,6 +62,7 @@ def test_expected(name, characters, good): rv.append(["Character", characters]) return rv + def make_test_list(entities): tests = [] for entity_name, characters in entities.items(): @@ -61,20 +71,23 @@ def make_test_list(entities): tests.append((entity_name, characters, True)) return sorted(tests) + def subentity_exists(entity_name, entities): for i in range(1, len(entity_name)): if entity_name[:-i] in entities: return True return False + def make_entities_code(entities): - entities_text = "\n".join(" \"%s\": u\"%s\","%( - name, entities[name].encode( - "unicode-escape").replace("\"", "\\\"")) - for name in sorted(entities.keys())) + entities_text = "\n".join(" \"%s\": u\"%s\"," % ( + name, entities[name].encode( + "unicode-escape").replace("\"", "\\\"")) + for name in sorted(entities.keys())) return """entities = { %s -}"""%entities_text +}""" % entities_text + def main(): entities = entity_table(parse()) @@ -85,4 +98,3 @@ def main(): if __name__ == "__main__": main() - diff --git a/utils/spider.py b/utils/spider.py index ac5f9fbe..3a325888 100644 --- a/utils/spider.py +++ b/utils/spider.py @@ -7,7 +7,9 @@ s.spider("http://www.google.com", maxURLs=100) """ -import urllib.request, urllib.error, urllib.parse +import urllib.request +import urllib.error +import urllib.parse import urllib.robotparser import md5 @@ -16,11 +18,13 @@ import html5lib from html5lib.treebuilders import etree + class Spider(object): + def __init__(self): self.unvisitedURLs = set() self.visitedURLs = set() - self.buggyURLs=set() + self.buggyURLs = set() self.robotParser = urllib.robotparser.RobotFileParser() self.contentDigest = {} self.http = httplib2.Http(".cache") @@ -70,18 +74,18 @@ def updateURLs(self, tree): update the list of visited and unvisited URLs according to whether we have seen them before or not""" urls = set() - #Remove all links we have already visited + # Remove all links we have already visited for link in tree.findall(".//a"): - try: - url = urllib.parse.urldefrag(link.attrib['href'])[0] - if (url and url not in self.unvisitedURLs and url + try: + url = urllib.parse.urldefrag(link.attrib['href'])[0] + if (url and url not in self.unvisitedURLs and url not in self.visitedURLs): - urls.add(url) - except KeyError: - pass + urls.add(url) + except KeyError: + pass - #Remove all non-http URLs and add a suitable base URL where that is - #missing + # Remove all non-http URLs and add a suitable base URL where that is + # missing newUrls = set() for url in urls: splitURL = list(urllib.parse.urlsplit(url)) @@ -93,23 +97,22 @@ def updateURLs(self, tree): urls = newUrls responseHeaders = {} - #Now we want to find the content types of the links we haven't visited + # Now we want to find the content types of the links we haven't visited for url in urls: try: resp, content = self.http.request(url, "HEAD") responseHeaders[url] = resp - except AttributeError as KeyError: - #Don't know why this happens + except AttributeError: + # Don't know why this happens pass - - #Remove links not of content-type html or pages not found - #XXX - need to deal with other status codes? + # Remove links not of content-type html or pages not found + # XXX - need to deal with other status codes? 
toVisit = set([url for url in urls if url in responseHeaders and - "html" in responseHeaders[url]['content-type'] and - responseHeaders[url]['status'] == "200"]) + "html" in responseHeaders[url]['content-type'] and + responseHeaders[url]['status'] == "200"]) - #Now check we are allowed to spider the page + # Now check we are allowed to spider the page for url in toVisit: robotURL = list(urllib.parse.urlsplit(url)[:2]) robotURL.extend(["robots.txt", "", ""])
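Two of the hunks above, parseError() in html5parser.py and Attributes.__init__() in etree_lxml.py, replace a mutable default argument with the None-guard idiom. A minimal sketch of the pitfall being fixed, with hypothetical functions:

```python
def buggy(errorcode, datavars={}):  # one dict object shared across all calls
    datavars["seen"] = datavars.get("seen", 0) + 1
    return datavars["seen"]


def fixed(errorcode, datavars=None):  # the idiom the diff adopts
    if datavars is None:
        datavars = {}  # fresh dict on every call
    datavars["seen"] = datavars.get("seen", 0) + 1
    return datavars["seen"]


assert buggy("err") == 1 and buggy("err") == 2  # state leaks between calls
assert fixed("err") == 1 and fixed("err") == 1  # no shared state
```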