diff --git a/.prospector.yaml b/.prospector.yaml
new file mode 100644
index 00000000..7e8efe1a
--- /dev/null
+++ b/.prospector.yaml
@@ -0,0 +1,21 @@
+strictness: veryhigh
+doc-warnings: false
+test-warnings: false
+
+max-line-length: 139
+
+requirements:
+  - requirements.txt
+  - requirements-test.txt
+  - requirements-optional.txt
+
+ignore-paths:
+  - parse.py
+  - utils/
+
+python-targets:
+  - 2
+  - 3
+
+mccabe:
+  run: false
diff --git a/.pylintrc b/.pylintrc
new file mode 100644
index 00000000..ea74d5db
--- /dev/null
+++ b/.pylintrc
@@ -0,0 +1,10 @@
+[MASTER]
+ignore=tests
+
+[MESSAGES CONTROL]
+# messages up to fixme should probably be fixed somehow
+disable = redefined-builtin,attribute-defined-outside-init,anomalous-backslash-in-string,no-self-use,redefined-outer-name,bad-continuation,wrong-import-order,superfluous-parens,no-member,duplicate-code,super-init-not-called,abstract-method,property-on-old-class,wrong-import-position,no-name-in-module,no-init,bad-mcs-classmethod-argument,bad-classmethod-argument,fixme,invalid-name,import-error,too-few-public-methods,too-many-ancestors,too-many-arguments,too-many-boolean-expressions,too-many-branches,too-many-instance-attributes,too-many-locals,too-many-lines,too-many-public-methods,too-many-return-statements,too-many-statements,missing-docstring,line-too-long,locally-disabled,locally-enabled,bad-builtin,deprecated-lambda
+
+[FORMAT]
+max-line-length=139
+single-line-if-stmt=no
diff --git a/flake8-run.sh b/flake8-run.sh
index 685ec6ab..d9264946 100755
--- a/flake8-run.sh
+++ b/flake8-run.sh
@@ -5,8 +5,5 @@ if [[ ! -x $(which flake8) ]]; then
     exit 1
 fi
 
-find html5lib/ -name '*.py' -and -not -name 'constants.py' -print0 | xargs -0 flake8 --ignore=E501
-flake1=$?
-flake8 --max-line-length=99 --ignore=E126 html5lib/constants.py
-flake2=$?
-exit $[$flake1 || $flake2]
+flake8 `dirname $0`
+exit $?
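(With these two files at the repository root, `prospector` and `pylint html5lib` should pick the configuration up automatically; the pared-down flake8-run.sh now lints the whole tree in one pass instead of special-casing constants.py.)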
diff --git a/html5lib/constants.py b/html5lib/constants.py
index 2244933c..df1f061e 100644
--- a/html5lib/constants.py
+++ b/html5lib/constants.py
@@ -2819,7 +2819,6 @@
     0x0d: "\u000D",
     0x80: "\u20AC",
     0x81: "\u0081",
-    0x81: "\u0081",
     0x82: "\u201A",
     0x83: "\u0192",
     0x84: "\u201E",
diff --git a/html5lib/filters/sanitizer.py b/html5lib/filters/sanitizer.py
index caddd318..7f81c0d1 100644
--- a/html5lib/filters/sanitizer.py
+++ b/html5lib/filters/sanitizer.py
@@ -765,15 +765,15 @@ def sanitize_token(self, token):
             if ((namespace, name) in self.allowed_elements or
                 (namespace is None and
                  (namespaces["html"], name) in self.allowed_elements)):
-                return self.allowed_token(token, token_type)
+                return self.allowed_token(token)
             else:
-                return self.disallowed_token(token, token_type)
+                return self.disallowed_token(token)
         elif token_type == "Comment":
             pass
         else:
             return token
 
-    def allowed_token(self, token, token_type):
+    def allowed_token(self, token):
         if "data" in token:
             attrs = token["data"]
             attr_names = set(attrs.keys())
@@ -823,7 +823,8 @@ def allowed_token(self, token, token_type):
             token["data"] = attrs
         return token
 
-    def disallowed_token(self, token, token_type):
+    def disallowed_token(self, token):
+        token_type = token["type"]
         if token_type == "EndTag":
             token["data"] = "</%s>" % token["name"]
         elif token["data"]:
@@ -862,7 +863,7 @@ def sanitize_css(self, style):
                          'padding']:
                 for keyword in value.split():
                     if keyword not in self.allowed_css_keywords and \
-                            not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
+                            not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):  # noqa
                         break
                 else:
                     clean.append(prop + ': ' + value + ';')
diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
index e6808425..331b8fd7 100644
--- a/html5lib/html5parser.py
+++ b/html5lib/html5parser.py
@@ -121,7 +121,7 @@ def reset(self):
             self.phase.insertHtmlElement()
             self.resetInsertionMode()
         else:
-            self.innerHTML = False
+            self.innerHTML = False  # pylint:disable=redefined-variable-type
             self.phase = self.phases["initial"]
 
         self.lastPhase = None
@@ -241,6 +241,7 @@ def parse(self, stream, encoding=None, parseMeta=True,
 
     def parseFragment(self, stream, container="div", encoding=None,
                       parseMeta=False, useChardet=True, scripting=False):
+        # pylint:disable=unused-argument
         """Parse a HTML fragment into a well-formed tree fragment
 
         container - name of the element we're setting the innerHTML property
@@ -259,8 +260,10 @@ def parseFragment(self, stream, container="div", encoding=None,
                     encoding=encoding, scripting=scripting)
         return self.tree.getFragment()
 
-    def parseError(self, errorcode="XXX-undefined-error", datavars={}):
+    def parseError(self, errorcode="XXX-undefined-error", datavars=None):
         # XXX The idea is to make errorcode mandatory.
+        if datavars is None:
+            datavars = {}
         self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
         if self.strict:
             raise ParseError(E[errorcode] % datavars)
@@ -361,6 +364,7 @@ def adjustForeignAttributes(self, token):
             del token["data"][originalName]
 
     def reparseTokenNormal(self, token):
+        # pylint:disable=unused-argument
         self.parser.phase()
 
     def resetInsertionMode(self):
@@ -458,6 +462,7 @@ def getMetaclass(use_metaclass, metaclass_func):
         else:
             return type
 
+    # pylint:disable=unused-argument
     class Phase(with_metaclass(getMetaclass(debug, log))):
         """Base class for helper object that implements each phase of processing
         """
@@ -948,8 +953,8 @@ class InBodyPhase(Phase):
         def __init__(self, parser, tree):
             Phase.__init__(self, parser, tree)
 
-            # Keep a ref to this for special handling of whitespace in <pre>
-            self.processSpaceCharactersNonPre = self.processSpaceCharacters
+            # Set this to the default handler
+            self.processSpaceCharacters = self.processSpaceCharactersNonPre
 
             self.startTagHandler = utils.MethodDispatcher([
                 ("html", self.startTagHtml),
@@ -1082,7 +1087,7 @@ def processCharacters(self, token):
                      for char in token["data"]])):
                 self.parser.framesetOK = False
 
-        def processSpaceCharacters(self, token):
+        def processSpaceCharactersNonPre(self, token):
             self.tree.reconstructActiveFormattingElements()
             self.tree.insertText(token["data"])
 
@@ -2763,6 +2768,7 @@ def startTagOther(self, token):
         def processEndTag(self, token):
             self.parser.parseError("expected-eof-but-got-end-tag",
                                    {"name": token["name"]})
+    # pylint:enable=unused-argument
 
     return {
         "initial": InitialPhase,
diff --git a/html5lib/ihatexml.py b/html5lib/ihatexml.py
index 5da5d938..d6d1d6fb 100644
--- a/html5lib/ihatexml.py
+++ b/html5lib/ihatexml.py
@@ -175,9 +175,9 @@ def escapeRegexp(string):
     return string
 
 # output from the above
-nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
+nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')  # noqa
 
-nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
+nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')  # noqa
 
 # Simpler things
 nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]")
@@ -186,7 +186,7 @@ def escapeRegexp(string):
 class InfosetFilter(object):
     replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
 
-    def __init__(self, replaceChars=None,
+    def __init__(self,
                  dropXmlnsLocalName=False,
                  dropXmlnsAttrNs=False,
                  preventDoubleDashComments=False,
@@ -217,7 +217,7 @@ def coerceAttribute(self, name, namespace=None):
         else:
             return self.toXmlName(name)
 
-    def coerceElement(self, name, namespace=None):
+    def coerceElement(self, name):
         return self.toXmlName(name)
 
     def coerceComment(self, data):
@@ -232,7 +232,7 @@ def coerceComment(self, data):
 
     def coerceCharacters(self, data):
         if self.replaceFormFeedCharacters:
-            for i in range(data.count("\x0C")):
+            for _ in range(data.count("\x0C")):
                 warnings.warn("Text cannot contain U+000C", DataLossWarning)
             data = data.replace("\x0C", " ")
         # Other non-xml characters
diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index 15acba0d..58d626c9 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -19,12 +19,6 @@
 except ImportError:
     BytesIO = StringIO
 
-try:
-    from io import BufferedIOBase
-except ImportError:
-    class BufferedIOBase(object):
-        pass
-
 # Non-unicode versions of constants for use in the pre-parser
 spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
 asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
@@ -32,15 +26,17 @@ class BufferedIOBase(object):
 spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
 
 
-invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"
+invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"  # noqa
 
 if utils.supports_lone_surrogates:
     # Use one extra step of indirection and create surrogates with
-    # unichr. Not using this indirection would introduce an illegal
+    # eval. Not using this indirection would introduce an illegal
     # unicode literal on platforms not supporting such lone
     # surrogates.
-    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate +
-                                    eval('"\\uD800-\\uDFFF"'))
+    assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
+    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
+                                    eval('"\\uD800-\\uDFFF"') +  # pylint:disable=eval-used
+                                    "]")
 else:
     invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
 
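The splice-before-the-closing-bracket dance above keeps the source free of literal lone surrogates, which some builds (Jython, for one) cannot even parse; the \uD800-\uDFFF range is only materialized via eval when utils.supports_lone_surrogates says it is safe. A stripped-down sketch of the same idea (names here are illustrative, not the module's own):

    import re

    supports_lone_surrogates = True  # stand-in for utils.supports_lone_surrogates

    invalid_no_surrogate = "[\u0001-\u0008\uFDD0-\uFDEF]"  # abbreviated class
    if supports_lone_surrogates:
        # Build the surrogate range at runtime and splice it inside the
        # final "]" so the character class stays a single alternation.
        invalid_re = re.compile(invalid_no_surrogate[:-1] +
                                eval('"\\uD800-\\uDFFF"') + "]")
    else:
        invalid_re = re.compile(invalid_no_surrogate)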
@@ -296,7 +292,7 @@ def readChunk(self, chunkSize=None):
         return True
 
     def characterErrorsUCS4(self, data):
-        for i in range(len(invalid_unicode_re.findall(data))):
+        for _ in range(len(invalid_unicode_re.findall(data))):
             self.errors.append("invalid-codepoint")
 
     def characterErrorsUCS2(self, data):
@@ -453,7 +449,7 @@ def openStream(self, source):
 
         try:
             stream.seek(stream.tell())
-        except:
+        except:  # pylint:disable=bare-except
             stream = BufferedStream(stream)
 
         return stream
@@ -571,6 +567,7 @@ def __new__(self, value):
         return bytes.__new__(self, value.lower())
 
     def __init__(self, value):
+        # pylint:disable=unused-argument
         self._position = -1
 
     def __iter__(self):
@@ -681,7 +678,7 @@ def getEncoding(self):
[...]
diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py
[...]
-    encode_entity_map = {}
-    is_ucs4 = len("\U0010FFFF") == 1
-    for k, v in list(entities.items()):
-        # skip multi-character entities
-        if ((is_ucs4 and len(v) > 1) or
-                (not is_ucs4 and len(v) > 2)):
-            continue
-        if v != "&":
-            if len(v) == 2:
-                v = utils.surrogatePairToCodepoint(v)
-            else:
-                v = ord(v)
-            if v not in encode_entity_map or k.islower():
-                # prefer < over < and similarly for &, >, etc.
-                encode_entity_map[v] = k
-
-    def htmlentityreplace_errors(exc):
-        if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
-            res = []
-            codepoints = []
-            skip = False
-            for i, c in enumerate(exc.object[exc.start:exc.end]):
-                if skip:
-                    skip = False
-                    continue
-                index = i + exc.start
-                if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
-                    codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
-                    skip = True
-                else:
-                    codepoint = ord(c)
-                codepoints.append(codepoint)
-            for cp in codepoints:
-                e = encode_entity_map.get(cp)
-                if e:
-                    res.append("&")
-                    res.append(e)
-                    if not e.endswith(";"):
-                        res.append(";")
-                else:
-                    res.append("&#x%s;" % (hex(cp)[2:]))
-            return ("".join(res), exc.end)
-        else:
-            return xmlcharrefreplace_errors(exc)
 
-    register_error(unicode_encode_errors, htmlentityreplace_errors)
+encode_entity_map = {}
+is_ucs4 = len("\U0010FFFF") == 1
+for k, v in list(entities.items()):
+    # skip multi-character entities
+    if ((is_ucs4 and len(v) > 1) or
+            (not is_ucs4 and len(v) > 2)):
+        continue
+    if v != "&":
+        if len(v) == 2:
+            v = utils.surrogatePairToCodepoint(v)
+        else:
+            v = ord(v)
+        if v not in encode_entity_map or k.islower():
+            # prefer < over < and similarly for &, >, etc.
+            encode_entity_map[v] = k
+
+
+def htmlentityreplace_errors(exc):
+    if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
+        res = []
+        codepoints = []
+        skip = False
+        for i, c in enumerate(exc.object[exc.start:exc.end]):
+            if skip:
+                skip = False
+                continue
+            index = i + exc.start
+            if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
+                codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
+                skip = True
+            else:
+                codepoint = ord(c)
+            codepoints.append(codepoint)
+        for cp in codepoints:
+            e = encode_entity_map.get(cp)
+            if e:
+                res.append("&")
+                res.append(e)
+                if not e.endswith(";"):
+                    res.append(";")
+            else:
+                res.append("&#x%s;" % (hex(cp)[2:]))
+        return ("".join(res), exc.end)
+    else:
+        return xmlcharrefreplace_errors(exc)
 
-    del register_error
+register_error("htmlentityreplace", htmlentityreplace_errors)
 
 
 class HTMLSerializer(object):
@@ -168,7 +163,7 @@ def __init__(self, **kwargs):
     def encode(self, string):
         assert(isinstance(string, text_type))
         if self.encoding:
-            return string.encode(self.encoding, unicode_encode_errors)
+            return string.encode(self.encoding, "htmlentityreplace")
         else:
             return string
 
@@ -180,6 +175,7 @@ def encodeStrict(self, string):
             return string
 
     def serialize(self, treewalker, encoding=None):
+        # pylint:disable=too-many-nested-blocks
         self.encoding = encoding
         in_cdata = False
         self.errors = []
@@ -241,7 +237,7 @@ def serialize(self, treewalker, encoding=None):
                     in_cdata = True
                 elif in_cdata:
                     self.serializeError("Unexpected child element of a CDATA element")
-                for (attr_namespace, attr_name), attr_value in token["data"].items():
+                for (_, attr_name), attr_value in token["data"].items():
                     # TODO: Add namespace support here
                     k = attr_name
                     v = attr_value
@@ -328,6 +324,6 @@ def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
             raise SerializeError
 
 
-def SerializeError(Exception):
+class SerializeError(Exception):
     """Error in serialized tree"""
     pass
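With the handler now registered unconditionally under the plain codec-error name "htmlentityreplace", any encode call can ask for entity fallback for characters the target charset lacks. Roughly (the printed bytes are indicative, not asserted by the patch):

    import html5lib.serializer  # importing registers "htmlentityreplace"

    print("café \u2264 2€".encode("ascii", "htmlentityreplace"))
    # e.g. b'caf&eacute; &le; 2&euro;'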
diff --git a/html5lib/tests/support.py b/html5lib/tests/support.py
index 6e6a916b..6ae09dbe 100644
--- a/html5lib/tests/support.py
+++ b/html5lib/tests/support.py
@@ -1,5 +1,7 @@
 from __future__ import absolute_import, division, unicode_literals
 
+# pylint:disable=wrong-import-position
+
 import os
 import sys
 import codecs
@@ -13,7 +15,7 @@
                                                 os.path.pardir,
                                                 os.path.pardir)))
 
-from html5lib import treebuilders, treewalkers, treeadapters
+from html5lib import treebuilders, treewalkers, treeadapters  # noqa
 del base_path
 
 # Build a dict of available trees
@@ -26,14 +28,14 @@
 }
 
 # ElementTree impls
-import xml.etree.ElementTree as ElementTree
+import xml.etree.ElementTree as ElementTree  # noqa
 treeTypes['ElementTree'] = {
     "builder": treebuilders.getTreeBuilder("etree", ElementTree, fullTree=True),
     "walker": treewalkers.getTreeWalker("etree", ElementTree)
 }
 
 try:
-    import xml.etree.cElementTree as cElementTree
+    import xml.etree.cElementTree as cElementTree  # noqa
 except ImportError:
     treeTypes['cElementTree'] = None
 else:
@@ -47,7 +49,7 @@
         }
 
 try:
-    import lxml.etree as lxml  # flake8: noqa
+    import lxml.etree as lxml  # noqa
 except ImportError:
     treeTypes['lxml'] = None
 else:
@@ -58,7 +60,7 @@
 
 # Genshi impls
 try:
-    import genshi  # flake8: noqa
+    import genshi  # noqa
 except ImportError:
     pass
 else:
@@ -68,6 +70,8 @@
         "walker": treewalkers.getTreeWalker("genshi")
     }
 
+# pylint:enable=wrong-import-position
+
 
 def get_data_files(subdirectory, files='*.dat', search_dir=test_dir):
     return sorted(glob.glob(os.path.join(search_dir, subdirectory, files)))
diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py
index 09504654..c5d2af12 100644
--- a/html5lib/tests/test_encoding.py
+++ b/html5lib/tests/test_encoding.py
@@ -51,19 +51,21 @@ def runPreScanEncodingTest(data, encoding):
 def test_encoding():
     for filename in get_data_files("encoding"):
         tests = _TestData(filename, b"data", encoding=None)
-        for idx, test in enumerate(tests):
+        for test in tests:
             yield (runParserEncodingTest, test[b'data'], test[b'encoding'])
             yield (runPreScanEncodingTest, test[b'data'], test[b'encoding'])
 
+# pylint:disable=wrong-import-position
 try:
     try:
-        import charade  # flake8: noqa
+        import charade  # noqa
     except ImportError:
-        import chardet  # flake8: noqa
+        import chardet  # noqa
 except ImportError:
     print("charade/chardet not found, skipping chardet tests")
 else:
     def test_chardet():
-        with open(os.path.join(test_dir, "encoding" , "chardet", "test_big5.txt"), "rb") as fp:
+        with open(os.path.join(test_dir, "encoding", "chardet", "test_big5.txt"), "rb") as fp:
             encoding = inputstream.HTMLInputStream(fp.read()).charEncoding
             assert encoding[0].name == "big5"
+# pylint:enable=wrong-import-position
diff --git a/html5lib/tests/test_parser2.py b/html5lib/tests/test_parser2.py
index 2f3ba2c8..f8e1ac43 100644
--- a/html5lib/tests/test_parser2.py
+++ b/html5lib/tests/test_parser2.py
@@ -2,10 +2,8 @@
 
 import io
 
-import pytest
+from . import support  # noqa
 
-from . import support  # flake8: noqa
-from html5lib import html5parser
 from html5lib.constants import namespaces
 from html5lib import parse
 
@@ -23,29 +21,29 @@ def test_line_counter():
 
 def test_namespace_html_elements_0_dom():
     doc = parse("<html></html>",
-                         treebuilder="dom",
-                         namespaceHTMLElements=True)
+                treebuilder="dom",
+                namespaceHTMLElements=True)
     assert doc.childNodes[0].namespaceURI == namespaces["html"]
 
 
 def test_namespace_html_elements_1_dom():
     doc = parse("<html></html>",
-                         treebuilder="dom",
-                         namespaceHTMLElements=False)
+                treebuilder="dom",
+                namespaceHTMLElements=False)
     assert doc.childNodes[0].namespaceURI is None
 
 
 def test_namespace_html_elements_0_etree():
     doc = parse("<html></html>",
-                         treebuilder="etree",
-                         namespaceHTMLElements=True)
+                treebuilder="etree",
+                namespaceHTMLElements=True)
     assert doc.tag == "{%s}html" % (namespaces["html"],)
 
 
 def test_namespace_html_elements_1_etree():
     doc = parse("<html></html>",
-                         treebuilder="etree",
-                         namespaceHTMLElements=False)
+                treebuilder="etree",
+                namespaceHTMLElements=False)
     assert doc.tag == "html"
 
 
diff --git a/html5lib/tests/test_sanitizer.py b/html5lib/tests/test_sanitizer.py
index 1f8a06f6..e19deea8 100644
--- a/html5lib/tests/test_sanitizer.py
+++ b/html5lib/tests/test_sanitizer.py
@@ -4,7 +4,7 @@
 from html5lib.filters import sanitizer
 
 
-def runSanitizerTest(name, expected, input):
+def runSanitizerTest(_, expected, input):
     parsed = parseFragment(expected)
     expected = serialize(parsed,
                          omit_optional_tags=False,
@@ -63,7 +63,8 @@ def test_sanitizer():
     for ns, tag_name in sanitizer.allowed_elements:
         if ns != constants.namespaces["html"]:
             continue
-        if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', 'select']:
+        if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td',
+                        'tfoot', 'th', 'thead', 'tr', 'select']:
             continue  # TODO
         if tag_name == 'image':
             yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
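Since disallowed_token now reads the token type off the token itself, the filter's public use is unchanged: parse, walk, filter, serialize. A minimal usage sketch (tree and entry-point names as used elsewhere in these tests):

    import html5lib
    from html5lib.filters import sanitizer

    dom = html5lib.parseFragment("<script>evil()</script><b>fine</b>")
    walker = html5lib.getTreeWalker("etree")
    stream = sanitizer.Filter(walker(dom))
    print(html5lib.serializer.HTMLSerializer().render(stream))
    # the script tag comes out escaped as text; <b> passes through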
diff --git a/html5lib/tests/test_serializer.py b/html5lib/tests/test_serializer.py
index b3ffe0df..b3cda7d7 100644
--- a/html5lib/tests/test_serializer.py
+++ b/html5lib/tests/test_serializer.py
@@ -12,6 +12,7 @@
 from html5lib.serializer import HTMLSerializer, serialize
 from html5lib.treewalkers._base import TreeWalker
 
+# pylint:disable=wrong-import-position
 optionals_loaded = []
 
 try:
@@ -19,6 +20,7 @@
     optionals_loaded.append("lxml")
 except ImportError:
     pass
+# pylint:enable=wrong-import-position
 
 default_namespace = constants.namespaces["html"]
 
@@ -219,5 +221,5 @@ def test_serializer():
     for filename in get_data_files('serializer-testdata', '*.test', os.path.dirname(__file__)):
         with open(filename) as fp:
             tests = json.load(fp)
-            for index, test in enumerate(tests['tests']):
+            for test in tests['tests']:
                 yield runSerializerTest, test["input"], test["expected"], test.get("options", {})
diff --git a/html5lib/tests/test_stream.py b/html5lib/tests/test_stream.py
index 3b659fbb..77e411d5 100644
--- a/html5lib/tests/test_stream.py
+++ b/html5lib/tests/test_stream.py
@@ -1,15 +1,20 @@
 from __future__ import absolute_import, division, unicode_literals
 
-from . import support  # flake8: noqa
+from . import support  # noqa
+
 import codecs
-from io import BytesIO
-import socket
+import sys
+from io import BytesIO, StringIO
+
+import pytest
 
 import six
 from six.moves import http_client, urllib
 
 from html5lib.inputstream import (BufferedStream, HTMLInputStream,
                                   HTMLUnicodeInputStream, HTMLBinaryInputStream)
+from html5lib.utils import supports_lone_surrogates
+
 
 def test_basic():
     s = b"abc"
@@ -17,6 +22,7 @@ def test_basic():
     read = fp.read(10)
     assert read == s
 
+
 def test_read_length():
     fp = BufferedStream(BytesIO(b"abcdef"))
     read1 = fp.read(1)
@@ -28,17 +34,23 @@ def test_read_length():
     read4 = fp.read(4)
     assert read4 == b""
 
+
 def test_tell():
     fp = BufferedStream(BytesIO(b"abcdef"))
     read1 = fp.read(1)
+    assert read1 == b"a"
     assert fp.tell() == 1
     read2 = fp.read(2)
+    assert read2 == b"bc"
     assert fp.tell() == 3
     read3 = fp.read(3)
+    assert read3 == b"def"
     assert fp.tell() == 6
     read4 = fp.read(4)
+    assert read4 == b""
     assert fp.tell() == 6
 
+
 def test_seek():
     fp = BufferedStream(BytesIO(b"abcdef"))
     read1 = fp.read(1)
@@ -55,20 +67,26 @@ def test_seek():
     read5 = fp.read(2)
     assert read5 == b"ef"
 
+
 def test_seek_tell():
     fp = BufferedStream(BytesIO(b"abcdef"))
     read1 = fp.read(1)
+    assert read1 == b"a"
     assert fp.tell() == 1
     fp.seek(0)
     read2 = fp.read(1)
+    assert read2 == b"a"
     assert fp.tell() == 1
     read3 = fp.read(2)
+    assert read3 == b"bc"
     assert fp.tell() == 3
     fp.seek(2)
     read4 = fp.read(2)
+    assert read4 == b"cd"
     assert fp.tell() == 4
     fp.seek(4)
     read5 = fp.read(2)
+    assert read5 == b"ef"
     assert fp.tell() == 6
 
 
@@ -85,11 +103,13 @@ def test_char_ascii():
     assert stream.charEncoding[0].name == 'windows-1252'
     assert stream.char() == "'"
 
+
 def test_char_utf8():
     stream = HTMLInputStream('\u2018'.encode('utf-8'), encoding='utf-8')
     assert stream.charEncoding[0].name == 'utf-8'
     assert stream.char() == '\u2018'
 
+
 def test_char_win1252():
     stream = HTMLInputStream("\xa9\xf1\u2019".encode('windows-1252'))
     assert stream.charEncoding[0].name == 'windows-1252'
@@ -97,16 +117,19 @@ def test_char_win1252():
     assert stream.char() == "\xf1"
     assert stream.char() == "\u2019"
 
+
 def test_bom():
     stream = HTMLInputStream(codecs.BOM_UTF8 + b"'")
     assert stream.charEncoding[0].name == 'utf-8'
     assert stream.char() == "'"
 
+
 def test_utf_16():
     stream = HTMLInputStream((' ' * 1025).encode('utf-16'))
     assert stream.charEncoding[0].name in ['utf-16le', 'utf-16be']
     assert len(stream.charsUntil(' ', True)) == 1025
 
+
 def test_newlines():
     stream = HTMLBinaryInputStreamShortChunk(codecs.BOM_UTF8 + b"a\nbb\r\nccc\rddddxe")
     assert stream.position() == (1, 0)
@@ -117,11 +140,13 @@ def test_newlines():
     assert stream.charsUntil('e') == "x"
     assert stream.position() == (4, 5)
 
+
 def test_newlines2():
     size = HTMLUnicodeInputStream._defaultChunkSize
     stream = HTMLInputStream("\r" * size + "\n")
     assert stream.charsUntil('x') == "\n" * size
 
+
 def test_position():
     stream = HTMLBinaryInputStreamShortChunk(codecs.BOM_UTF8 + b"a\nbb\nccc\nddde\nf\ngh")
     assert stream.position() == (1, 0)
@@ -140,6 +165,7 @@ def test_position():
     assert stream.charsUntil('h') == "e\nf\ng"
     assert stream.position() == (6, 1)
 
+
 def test_position2():
     stream = HTMLUnicodeInputStreamShortChunk("abc\nd")
     assert stream.position() == (1, 0)
@@ -154,6 +180,7 @@ def test_position2():
     assert stream.char() == "d"
     assert stream.position() == (2, 1)
 
+
 def test_python_issue_20007():
     """
     Make sure we have a work-around for Python bug #20007
@@ -161,6 +188,7 @@ def test_python_issue_20007():
     """
     class FakeSocket(object):
         def makefile(self, _mode, _bufsize=None):
+            # pylint:disable=unused-argument
             return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText")
 
     source = http_client.HTTPResponse(FakeSocket())
@@ -168,6 +196,7 @@ def makefile(self, _mode, _bufsize=None):
     stream = HTMLInputStream(source)
     assert stream.charsUntil(" ") == "Text"
 
+
 def test_python_issue_20007_b():
     """
     Make sure we have a work-around for Python bug #20007
@@ -178,6 +207,7 @@ def test_python_issue_20007_b():
 
     class FakeSocket(object):
         def makefile(self, _mode, _bufsize=None):
+            # pylint:disable=unused-argument
             return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText")
 
     source = http_client.HTTPResponse(FakeSocket())
@@ -185,3 +215,109 @@ def makefile(self, _mode, _bufsize=None):
     wrapped = urllib.response.addinfourl(source, source.msg, "http://example.com")
     stream = HTMLInputStream(wrapped)
     assert stream.charsUntil(" ") == "Text"
+
+
+@pytest.mark.parametrize("inp,num",
+                         [("\u0000", 0),
+                          ("\u0001", 1),
+                          ("\u0008", 1),
+                          ("\u0009", 0),
+                          ("\u000A", 0),
+                          ("\u000B", 1),
+                          ("\u000C", 0),
+                          ("\u000D", 0),
+                          ("\u000E", 1),
+                          ("\u001F", 1),
+                          ("\u0020", 0),
+                          ("\u007E", 0),
+                          ("\u007F", 1),
+                          ("\u009F", 1),
+                          ("\u00A0", 0),
+                          ("\uFDCF", 0),
+                          ("\uFDD0", 1),
+                          ("\uFDEF", 1),
+                          ("\uFDF0", 0),
+                          ("\uFFFD", 0),
+                          ("\uFFFE", 1),
+                          ("\uFFFF", 1),
+                          ("\U0001FFFD", 0),
+                          ("\U0001FFFE", 1),
+                          ("\U0001FFFF", 1),
+                          ("\U0002FFFD", 0),
+                          ("\U0002FFFE", 1),
+                          ("\U0002FFFF", 1),
+                          ("\U0003FFFD", 0),
+                          ("\U0003FFFE", 1),
+                          ("\U0003FFFF", 1),
+                          ("\U0004FFFD", 0),
+                          ("\U0004FFFE", 1),
+                          ("\U0004FFFF", 1),
+                          ("\U0005FFFD", 0),
+                          ("\U0005FFFE", 1),
+                          ("\U0005FFFF", 1),
+                          ("\U0006FFFD", 0),
+                          ("\U0006FFFE", 1),
+                          ("\U0006FFFF", 1),
+                          ("\U0007FFFD", 0),
+                          ("\U0007FFFE", 1),
+                          ("\U0007FFFF", 1),
+                          ("\U0008FFFD", 0),
+                          ("\U0008FFFE", 1),
+                          ("\U0008FFFF", 1),
+                          ("\U0009FFFD", 0),
+                          ("\U0009FFFE", 1),
+                          ("\U0009FFFF", 1),
+                          ("\U000AFFFD", 0),
+                          ("\U000AFFFE", 1),
+                          ("\U000AFFFF", 1),
+                          ("\U000BFFFD", 0),
+                          ("\U000BFFFE", 1),
+                          ("\U000BFFFF", 1),
+                          ("\U000CFFFD", 0),
+                          ("\U000CFFFE", 1),
+                          ("\U000CFFFF", 1),
+                          ("\U000DFFFD", 0),
+                          ("\U000DFFFE", 1),
+                          ("\U000DFFFF", 1),
+                          ("\U000EFFFD", 0),
+                          ("\U000EFFFE", 1),
+                          ("\U000EFFFF", 1),
+                          ("\U000FFFFD", 0),
+                          ("\U000FFFFE", 1),
+                          ("\U000FFFFF", 1),
+                          ("\U0010FFFD", 0),
+                          ("\U0010FFFE", 1),
+                          ("\U0010FFFF", 1),
+                          ("\x01\x01\x01", 3),
+                          ("a\x01a\x01a\x01a", 3)])
+def test_invalid_codepoints(inp, num):
+    stream = HTMLUnicodeInputStream(StringIO(inp))
+    for _i in range(len(inp)):
+        stream.char()
+    assert len(stream.errors) == num
+
+
+@pytest.mark.skipif(not supports_lone_surrogates, reason="doesn't support lone surrogates")
+@pytest.mark.parametrize("inp,num",
+                         [("'\\uD7FF'", 0),
+                          ("'\\uD800'", 1),
+                          ("'\\uDBFF'", 1),
+                          ("'\\uDC00'", 1),
+                          ("'\\uDFFF'", 1),
+                          ("'\\uE000'", 0),
+                          ("'\\uD800\\uD800\\uD800'", 3),
+                          ("'a\\uD800a\\uD800a\\uD800a'", 3),
+                          ("'\\uDFFF\\uDBFF'", 2),
+                          pytest.mark.skipif(sys.maxunicode == 0xFFFF,
+                                             ("'\\uDBFF\\uDFFF'", 2),
+                                             reason="narrow Python")])
+def test_invalid_codepoints_surrogates(inp, num):
+    inp = eval(inp)  # pylint:disable=eval-used
+    fp = StringIO(inp)
+    if ord(max(fp.read())) > 0xFFFF:
+        pytest.skip("StringIO altered string")
+    fp.seek(0)
+    stream = HTMLUnicodeInputStream(fp)
+    for _i in range(len(inp)):
+        stream.char()
+    assert len(stream.errors) == num
diff --git a/html5lib/tests/test_treeadapters.py b/html5lib/tests/test_treeadapters.py
index 5f38b6c3..95e56c00 100644
--- a/html5lib/tests/test_treeadapters.py
+++ b/html5lib/tests/test_treeadapters.py
@@ -1,6 +1,6 @@
 from __future__ import absolute_import, division, unicode_literals
 
-from . import support  # flake8: noqa
+from . import support  # noqa
 
 import html5lib
 from html5lib.treeadapters import sax
@@ -25,7 +25,7 @@ def test_to_sax():
         ('endElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title'),
         ('characters', '\n        '),
         ('endElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head'),
-        ('startElementNS',  ('http://www.w3.org/1999/xhtml', 'body'), 'body', {}),
+        ('startElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body', {}),
         ('startElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a', {(None, 'href'): '/'}),
         ('startElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b', {}),
         ('startElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p', {}),
diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py
index 045d9d7b..332027ac 100644
--- a/html5lib/tests/test_treewalkers.py
+++ b/html5lib/tests/test_treewalkers.py
@@ -31,7 +31,7 @@ def test_all_tokens():
         {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
         {'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'}
     ]
-    for treeName, treeCls in sorted(treeTypes.items()):
+    for _, treeCls in sorted(treeTypes.items()):
         if treeCls is None:
             continue
         p = html5parser.HTMLParser(tree=treeCls["builder"])
diff --git a/html5lib/tests/tokenizer.py b/html5lib/tests/tokenizer.py
index c6163a1f..255c1859 100644
--- a/html5lib/tests/tokenizer.py
+++ b/html5lib/tests/tokenizer.py
@@ -19,6 +19,7 @@ def __init__(self, initialState, lastStartTag=None):
         self._lastStartTag = lastStartTag
 
     def parse(self, stream, encoding=None, innerHTML=False):
+        # pylint:disable=unused-argument
         tokenizer = self.tokenizer(stream, encoding)
         self.outputTokens = []
 
diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py
index 79774578..dd6ea75f 100644
--- a/html5lib/tokenizer.py
+++ b/html5lib/tokenizer.py
@@ -1,9 +1,6 @@
 from __future__ import absolute_import, division, unicode_literals
 
-try:
-    chr = unichr # flake8: noqa
-except NameError:
-    pass
+from six import unichr as chr
 
 from collections import deque
 
@@ -147,8 +144,8 @@ def consumeEntity(self, allowedChar=None, fromAttribute=False):
         output = "&"
 
         charStack = [self.stream.char()]
-        if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&")
-                or (allowedChar is not None and allowedChar == charStack[0])):
+        if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
+                (allowedChar is not None and allowedChar == charStack[0])):
             self.stream.unget(charStack[0])
 
         elif charStack[0] == "#":
@@ -924,7 +921,7 @@ def attributeNameState(self):
             if self.lowercaseAttrName:
                 self.currentToken["data"][-1][0] = (
                     self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
-            for name, value in self.currentToken["data"][:-1]:
+            for name, _ in self.currentToken["data"][:-1]:
                 if self.currentToken["data"][-1][0] == name:
                     self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                             "duplicate-attribute"})
@@ -1716,11 +1713,11 @@ def cdataSectionState(self):
                 else:
                     data.append(char)
 
-        data = "".join(data)
+        data = "".join(data)  # pylint:disable=redefined-variable-type
         # Deal with null here rather than in the parser
         nullCount = data.count("\u0000")
         if nullCount > 0:
-            for i in range(nullCount):
+            for _ in range(nullCount):
                 self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                         "data": "invalid-codepoint"})
             data = data.replace("\u0000", "\uFFFD")
diff --git a/html5lib/treeadapters/__init__.py b/html5lib/treeadapters/__init__.py
index 57d71304..4f978466 100644
--- a/html5lib/treeadapters/__init__.py
+++ b/html5lib/treeadapters/__init__.py
@@ -5,7 +5,7 @@
 __all__ = ["sax"]
 
 try:
-    from . import genshi  # flake8: noqa
+    from . import genshi  # noqa
 except ImportError:
     pass
 else:
diff --git a/html5lib/treebuilders/_base.py b/html5lib/treebuilders/_base.py
index 8196f591..900a724c 100644
--- a/html5lib/treebuilders/_base.py
+++ b/html5lib/treebuilders/_base.py
@@ -126,6 +126,7 @@ class TreeBuilder(object):
     commentClass - the class to use for comments
     doctypeClass - the class to use for doctypes
     """
+    # pylint:disable=not-callable
 
     # Document class
     documentClass = None
diff --git a/html5lib/treebuilders/dom.py b/html5lib/treebuilders/dom.py
index 8656244f..b7df74b2 100644
--- a/html5lib/treebuilders/dom.py
+++ b/html5lib/treebuilders/dom.py
@@ -109,7 +109,7 @@ def getNameTuple(self):
 
         nameTuple = property(getNameTuple)
 
-    class TreeBuilder(_base.TreeBuilder):
+    class TreeBuilder(_base.TreeBuilder):  # pylint:disable=unused-variable
         def documentClass(self):
             self.dom = Dom.getDOMImplementation().createDocument(None, None, None)
             return weakref.proxy(self)
@@ -158,6 +158,7 @@ def insertText(self, data, parent=None):
             else:
                 # HACK: allow text nodes as children of the document node
                 if hasattr(self.dom, '_child_node_types'):
+                    # pylint:disable=protected-access
                     if Node.TEXT_NODE not in self.dom._child_node_types:
                         self.dom._child_node_types = list(self.dom._child_node_types)
                         self.dom._child_node_types.append(Node.TEXT_NODE)
diff --git a/html5lib/treebuilders/etree.py b/html5lib/treebuilders/etree.py
index 2c8ed19f..d394148d 100644
--- a/html5lib/treebuilders/etree.py
+++ b/html5lib/treebuilders/etree.py
@@ -1,4 +1,6 @@
 from __future__ import absolute_import, division, unicode_literals
+# pylint:disable=protected-access
+
 from six import text_type
 
 import re
@@ -253,7 +255,7 @@ def serializeElement(element, indent=0):
 
         return "\n".join(rv)
 
-    def tostring(element):
+    def tostring(element):  # pylint:disable=unused-variable
         """Serialize an element and its child nodes to a string"""
         rv = []
         filter = ihatexml.InfosetFilter()
@@ -307,7 +309,7 @@ def serializeElement(element):
 
         return "".join(rv)
 
-    class TreeBuilder(_base.TreeBuilder):
+    class TreeBuilder(_base.TreeBuilder):  # pylint:disable=unused-variable
         documentClass = Document
         doctypeClass = DocumentType
         elementClass = Element
diff --git a/html5lib/treebuilders/etree_lxml.py b/html5lib/treebuilders/etree_lxml.py
index 138b30bd..2a69769b 100644
--- a/html5lib/treebuilders/etree_lxml.py
+++ b/html5lib/treebuilders/etree_lxml.py
@@ -10,6 +10,7 @@
 """
 
 from __future__ import absolute_import, division, unicode_literals
+# pylint:disable=protected-access
 
 import warnings
 import re
@@ -53,7 +54,6 @@ def _getChildNodes(self):
 
 def testSerializer(element):
     rv = []
-    finalText = None
     infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True)
 
     def serializeElement(element, indent=0):
@@ -128,16 +128,12 @@ def serializeElement(element, indent=0):
                 rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
     serializeElement(element, 0)
 
-    if finalText is not None:
-        rv.append("|%s\"%s\"" % (' ' * 2, finalText))
-
     return "\n".join(rv)
 
 
 def tostring(element):
     """Serialize an element and its child nodes to a string"""
     rv = []
-    finalText = None
 
     def serializeElement(element):
         if not hasattr(element, "tag"):
@@ -173,9 +169,6 @@ def serializeElement(element):
 
     serializeElement(element)
 
-    if finalText is not None:
-        rv.append("%s\"" % (' ' * 2, finalText))
-
     return "".join(rv)
 
 
@@ -193,9 +186,11 @@ def __init__(self, namespaceHTMLElements, fullTree=False):
         self.namespaceHTMLElements = namespaceHTMLElements
 
         class Attributes(dict):
-            def __init__(self, element, value={}):
+            def __init__(self, element, value=None):
+                if value is None:
+                    value = {}
                 self._element = element
-                dict.__init__(self, value)
+                dict.__init__(self, value)  # pylint:disable=non-parent-init-called
                 for key, value in self.items():
                     if isinstance(key, tuple):
                         name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
@@ -303,12 +298,14 @@ def insertDoctype(self, token):
             self.doctype = doctype
 
     def insertCommentInitial(self, data, parent=None):
+        assert parent is None or parent is self.document
+        assert self.document._elementTree is None
         self.initial_comments.append(data)
 
     def insertCommentMain(self, data, parent=None):
         if (parent == self.document and
                 self.document._elementTree.getroot()[-1].tag == comment_type):
-                warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
+            warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
         super(TreeBuilder, self).insertComment(data, parent)
 
     def insertRoot(self, token):
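Both parseError (html5parser.py, above) and Attributes.__init__ here swap a mutable {} default for None plus an in-body fallback, avoiding Python's shared-default pitfall: a dict in the signature is created once, at definition time, and reused by every call that omits the argument. A minimal illustration (names are illustrative only, not from this patch):

    def report_bad(errorcode, datavars={}):   # one dict shared across calls
        datavars.setdefault("count", 0)
        datavars["count"] += 1
        return datavars

    def report_good(errorcode, datavars=None):  # fresh dict per call
        if datavars is None:
            datavars = {}
        datavars["count"] = datavars.get("count", 0) + 1
        return datavars

    assert report_bad("err")["count"] == 1
    assert report_bad("err")["count"] == 2   # state leaked from the first call
    assert report_good("err")["count"] == 1
    assert report_good("err")["count"] == 1  # calls stay independent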
diff --git a/html5lib/treewalkers/etree.py b/html5lib/treewalkers/etree.py
index 73c8e26a..d3b0c50e 100644
--- a/html5lib/treewalkers/etree.py
+++ b/html5lib/treewalkers/etree.py
@@ -22,7 +22,7 @@ def getETreeBuilder(ElementTreeImplementation):
     ElementTree = ElementTreeImplementation
     ElementTreeCommentType = ElementTree.Comment("asd").tag
 
-    class TreeWalker(_base.NonRecursiveTreeWalker):
+    class TreeWalker(_base.NonRecursiveTreeWalker):  # pylint:disable=unused-variable
         """Given the particular ElementTree representation, this implementation,
         to avoid using recursion, returns "nodes" as tuples with the following
         content:
@@ -38,7 +38,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
         """
         def getNodeDetails(self, node):
             if isinstance(node, tuple):  # It might be the root Element
-                elt, key, parents, flag = node
+                elt, _, _, flag = node
                 if flag in ("text", "tail"):
                     return _base.TEXT, getattr(elt, flag)
                 else:
diff --git a/html5lib/treewalkers/genshistream.py b/html5lib/treewalkers/genshistream.py
index 83cd1654..61cbfede 100644
--- a/html5lib/treewalkers/genshistream.py
+++ b/html5lib/treewalkers/genshistream.py
@@ -25,7 +25,7 @@ def __iter__(self):
                 yield token
 
     def tokens(self, event, next):
-        kind, data, pos = event
+        kind, data, _ = event
         if kind == START:
             tag, attribs = data
             name = tag.localname
diff --git a/html5lib/treewalkers/lxmletree.py b/html5lib/treewalkers/lxmletree.py
index 36850086..7d99adc2 100644
--- a/html5lib/treewalkers/lxmletree.py
+++ b/html5lib/treewalkers/lxmletree.py
@@ -117,6 +117,7 @@ def __len__(self):
 
 class TreeWalker(_base.NonRecursiveTreeWalker):
     def __init__(self, tree):
+        # pylint:disable=redefined-variable-type
         if hasattr(tree, "getroot"):
             self.fragmentChildren = set()
             tree = Root(tree)
diff --git a/html5lib/trie/__init__.py b/html5lib/trie/__init__.py
index a8cca8a9..a5ba4bf1 100644
--- a/html5lib/trie/__init__.py
+++ b/html5lib/trie/__init__.py
@@ -4,9 +4,11 @@
 
 Trie = PyTrie
 
+# pylint:disable=wrong-import-position
 try:
     from .datrie import Trie as DATrie
 except ImportError:
     pass
 else:
     Trie = DATrie
+# pylint:enable=wrong-import-position
diff --git a/html5lib/trie/_base.py b/html5lib/trie/_base.py
index 724486b1..25eece46 100644
--- a/html5lib/trie/_base.py
+++ b/html5lib/trie/_base.py
@@ -7,7 +7,8 @@ class Trie(Mapping):
     """Abstract base class for tries"""
 
     def keys(self, prefix=None):
-        keys = super().keys()
+        # pylint:disable=arguments-differ
+        keys = super(Trie, self).keys()
 
         if prefix is None:
             return set(keys)
diff --git a/html5lib/utils.py b/html5lib/utils.py
index c70de172..5fe237a0 100644
--- a/html5lib/utils.py
+++ b/html5lib/utils.py
@@ -22,12 +22,12 @@
 # surrogates, and there is no mechanism to further escape such
 # escapes.
 try:
-    _x = eval('"\\uD800"')
+    _x = eval('"\\uD800"')  # pylint:disable=eval-used
     if not isinstance(_x, text_type):
         # We need this with u"" because of http://bugs.jython.org/issue2039
-        _x = eval('u"\\uD800"')
+        _x = eval('u"\\uD800"')  # pylint:disable=eval-used
         assert isinstance(_x, text_type)
-except:
+except:  # pylint:disable=bare-except
     supports_lone_surrogates = False
 else:
     supports_lone_surrogates = True
@@ -52,7 +52,7 @@ def __init__(self, items=()):
         # anything here.
         _dictEntries = []
         for name, value in items:
-            if type(name) in (list, tuple, frozenset, set):
+            if isinstance(name, (list, tuple, frozenset, set)):
                 for item in name:
                     _dictEntries.append((item, value))
             else:
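Unlike type(x) in (...), isinstance also matches subclasses, so dispatcher keys that subclass tuple (a namedtuple, say) now take the expand-each-item branch; a quick check:

    from collections import namedtuple

    Key = namedtuple("Key", "a b")
    k = Key("x", "y")

    print(type(k) in (list, tuple, frozenset, set))      # False: exact type only
    print(isinstance(k, (list, tuple, frozenset, set)))  # True: tuple subclass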
diff --git a/parse.py b/parse.py
index cceea84d..2ed8f1c2 100755
--- a/parse.py
+++ b/parse.py
@@ -5,7 +5,6 @@
 """
 
 import sys
-import os
 import traceback
 from optparse import OptionParser
 
@@ -15,9 +14,10 @@
 from html5lib import constants
 from html5lib import utils
 
+
 def parse():
     optParser = getOptParser()
-    opts,args = optParser.parse_args()
+    opts, args = optParser.parse_args()
     encoding = "utf8"
 
     try:
@@ -25,7 +25,10 @@ def parse():
         # Try opening from the internet
         if f.startswith('http://'):
             try:
-                import urllib.request, urllib.parse, urllib.error, cgi
+                import urllib.request
+                import urllib.parse
+                import urllib.error
+                import cgi
                 f = urllib.request.urlopen(f)
                 contentType = f.headers.get('content-type')
                 if contentType:
@@ -41,7 +44,7 @@ def parse():
             try:
                 # Try opening from file system
                 f = open(f, "rb")
-            except IOError as e:                
+            except IOError as e:
                 sys.stderr.write("Unable to open file: %s\n" % e)
                 sys.exit(1)
     except IndexError:
@@ -82,14 +85,15 @@ def parse():
         if document:
             printOutput(p, document, opts)
             t2 = time.time()
-            sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1))
+            sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)" % (t1 - t0, t2 - t1))
         else:
-            sys.stderr.write("\n\nRun took: %fs"%(t1-t0))
+            sys.stderr.write("\n\nRun took: %fs" % (t1 - t0))
     else:
         document = run(parseMethod, f, encoding, opts.scripting)
         if document:
             printOutput(p, document, opts)
 
+
 def run(parseMethod, f, encoding, scripting):
     try:
         document = parseMethod(f, encoding=encoding, scripting=scripting)
@@ -98,6 +102,7 @@ def run(parseMethod, f, encoding, scripting):
         traceback.print_exc()
     return document
 
+
 def printOutput(parser, document, opts):
     if opts.encoding:
         print("Encoding:", parser.tokenizer.stream.charEncoding)
@@ -116,7 +121,7 @@ def printOutput(parser, document, opts):
             elif tb == "etree":
                 sys.stdout.write(utils.default_etree.tostring(document))
         elif opts.tree:
-            if not hasattr(document,'__getitem__'):
+            if not hasattr(document, '__getitem__'):
                 document = [document]
             for fragment in document:
                 print(parser.tree.testSerializer(fragment))
@@ -126,7 +131,7 @@ def printOutput(parser, document, opts):
             kwargs = {}
             for opt in serializer.HTMLSerializer.options:
                 try:
-                    kwargs[opt] = getattr(opts,opt)
+                    kwargs[opt] = getattr(opts, opt)
                 except:
                     pass
             if not kwargs['quote_char']:
@@ -142,12 +147,14 @@ def printOutput(parser, document, opts):
                 encoding = "utf-8"
             for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding):
                 sys.stdout.write(text)
-            if not text.endswith('\n'): sys.stdout.write('\n')
+            if not text.endswith('\n'):
+                sys.stdout.write('\n')
     if opts.error:
-        errList=[]
+        errList = []
         for pos, errorcode, datavars in parser.errors:
-            errList.append("Line %i Col %i"%pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
-        sys.stdout.write("\nParse errors:\n" + "\n".join(errList)+"\n")
+            errList.append("Line %i Col %i" % pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars)
+        sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")
+
 
 def getOptParser():
     parser = OptionParser(usage=__doc__)
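One idiom worth noting in the reflowed error report above: pos is already a (line, col) tuple, so it can feed the two %i slots directly before the error message is concatenated on. In isolation:

    pos = (12, 7)  # (line, col), as stored alongside each parse error
    print("Line %i Col %i" % pos)  # -> Line 12 Col 7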
diff --git a/setup.cfg b/setup.cfg
index 2a9acf13..3152ac54 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,2 +1,11 @@
 [bdist_wheel]
 universal = 1
+
+[pep8]
+ignore = N
+max-line-length = 139
+exclude = .git,__pycache__,.tox,doc
+
+[flake8]
+ignore = N
+max-line-length = 139
diff --git a/setup.py b/setup.py
index b6ea24af..b42ba400 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 from setuptools import setup
 
 
-classifiers=[
+classifiers = [
     'Development Status :: 5 - Production/Stable',
     'Intended Audience :: Developers',
     'License :: OSI Approved :: MIT License',
@@ -20,9 +20,9 @@
     'Programming Language :: Python :: 3.5',
     'Topic :: Software Development :: Libraries :: Python Modules',
     'Topic :: Text Processing :: Markup :: HTML'
-    ]
+]
 
-packages = ['html5lib'] + ['html5lib.'+name
+packages = ['html5lib'] + ['html5lib.' + name
                            for name in os.listdir(os.path.join('html5lib'))
                            if os.path.isdir(os.path.join('html5lib', name)) and
                            not name.startswith('.') and name != 'tests']
@@ -39,9 +39,9 @@
     assignments = filter(lambda x: isinstance(x, ast.Assign), t.body)
     for a in assignments:
         if (len(a.targets) == 1 and
-              isinstance(a.targets[0], ast.Name) and
-              a.targets[0].id == "__version__" and
-              isinstance(a.value, ast.Str)):
+                isinstance(a.targets[0], ast.Name) and
+                a.targets[0].id == "__version__" and
+                isinstance(a.value, ast.Str)):
             version = a.value.s
 
 setup(name='html5lib',
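The re-indented condition belongs to setup.py's trick for reading the version without importing the package: parse the module source with ast and pick out the top-level __version__ assignment. Roughly, assuming __version__ is a plain string literal (ast.Constant is the modern spelling of the hunk's ast.Str):

    import ast

    def find_version(source):
        version = None
        for node in ast.parse(source).body:
            if (isinstance(node, ast.Assign) and
                    len(node.targets) == 1 and
                    isinstance(node.targets[0], ast.Name) and
                    node.targets[0].id == "__version__" and
                    isinstance(node.value, ast.Constant) and
                    isinstance(node.value.value, str)):
                version = node.value.value
        return version

    print(find_version('__version__ = "1.0b10"'))  # -> 1.0b10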
diff --git a/utils/entities.py b/utils/entities.py
index 116a27cb..6dccf5f0 100644
--- a/utils/entities.py
+++ b/utils/entities.py
@@ -2,50 +2,59 @@
 
 import html5lib
 
+
 def parse(path="html5ents.xml"):
     return html5lib.parse(open(path), treebuilder="lxml")
 
+
 def entity_table(tree):
     return dict((entity_name("".join(tr[0].xpath(".//text()"))),
                  entity_characters(tr[1].text))
                 for tr in tree.xpath("//h:tbody/h:tr",
-                                     namespaces={"h":"http://www.w3.org/1999/xhtml"}))
+                                     namespaces={"h": "http://www.w3.org/1999/xhtml"}))
+
 
 def entity_name(inp):
     return inp.strip()
 
+
 def entity_characters(inp):
     return "".join(codepoint_to_character(item)
-                    for item in inp.split()
-                    if item)
+                   for item in inp.split()
+                   if item)
+
 
 def codepoint_to_character(inp):
-    return ("\U000"+inp[2:]).decode("unicode-escape")
+    return ("\\U000" + inp[2:]).decode("unicode-escape")
+
 
 def make_tests_json(entities):
     test_list = make_test_list(entities)
     tests_json = {"tests":
-                      [make_test(*item) for item in test_list]
+                  [make_test(*item) for item in test_list]
                   }
     return tests_json
 
+
 def make_test(name, characters, good):
     return {
-        "description":test_description(name, good),
-        "input":"&%s"%name,
-        "output":test_expected(name, characters, good)
-        }
+        "description": test_description(name, good),
+        "input": "&%s" % name,
+        "output": test_expected(name, characters, good)
+    }
+
 
 def test_description(name, good):
     with_semicolon = name.endswith(";")
-    semicolon_text = {True:"with a semi-colon",
-                      False:"without a semi-colon"}[with_semicolon]
+    semicolon_text = {True: "with a semi-colon",
+                      False: "without a semi-colon"}[with_semicolon]
     if good:
-        text = "Named entity: %s %s"%(name, semicolon_text)
+        text = "Named entity: %s %s" % (name, semicolon_text)
     else:
-        text = "Bad named entity: %s %s"%(name, semicolon_text)
+        text = "Bad named entity: %s %s" % (name, semicolon_text)
     return text
 
+
 def test_expected(name, characters, good):
     rv = []
     if not good or not name.endswith(";"):
@@ -53,6 +62,7 @@ def test_expected(name, characters, good):
     rv.append(["Character", characters])
     return rv
 
+
 def make_test_list(entities):
     tests = []
     for entity_name, characters in entities.items():
@@ -61,20 +71,23 @@ def make_test_list(entities):
         tests.append((entity_name, characters, True))
     return sorted(tests)
 
+
 def subentity_exists(entity_name, entities):
     for i in range(1, len(entity_name)):
         if entity_name[:-i] in entities:
             return True
     return False
 
+
 def make_entities_code(entities):
-    entities_text = "\n".join("    \"%s\": u\"%s\","%(
-            name, entities[name].encode(
-                "unicode-escape").replace("\"", "\\\""))
-                              for name in sorted(entities.keys()))
+    entities_text = "\n".join("    \"%s\": u\"%s\"," % (
+        name, entities[name].encode(
+            "unicode-escape").replace("\"", "\\\""))
+        for name in sorted(entities.keys()))
     return """entities = {
 %s
-}"""%entities_text
+}""" % entities_text
+
 
 def main():
     entities = entity_table(parse())
@@ -85,4 +98,3 @@ def main():
 
 if __name__ == "__main__":
     main()
-
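The one behavioural fix in this file is codepoint_to_character: the old "\U000" + inp[2:] relied on \U not being a recognised escape in a Python 2 str, which reads like a typo; "\\U000" spells the backslash out so that "unicode-escape" decodes a well-formed \U000XXXXX sequence. A Python 3 equivalent, assuming inputs of the form "U+205F", skips the escape round-trip entirely:

    def codepoint_to_character(inp):
        return chr(int(inp[2:], 16))  # "U+205F" -> '\u205f'

    print(repr(codepoint_to_character("U+205F")))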
diff --git a/utils/spider.py b/utils/spider.py
index ac5f9fbe..3a325888 100644
--- a/utils/spider.py
+++ b/utils/spider.py
@@ -7,7 +7,9 @@
 s.spider("http://www.google.com", maxURLs=100)
 """
 
-import urllib.request, urllib.error, urllib.parse
+import urllib.request
+import urllib.error
+import urllib.parse
 import urllib.robotparser
 import md5
 
@@ -16,11 +18,13 @@
 import html5lib
 from html5lib.treebuilders import etree
 
+
 class Spider(object):
+
     def __init__(self):
         self.unvisitedURLs = set()
         self.visitedURLs = set()
-        self.buggyURLs=set()
+        self.buggyURLs = set()
         self.robotParser = urllib.robotparser.RobotFileParser()
         self.contentDigest = {}
         self.http = httplib2.Http(".cache")
@@ -70,18 +74,18 @@ def updateURLs(self, tree):
         update the list of visited and unvisited URLs according to whether we
         have seen them before or not"""
         urls = set()
-        #Remove all links we have already visited
+        # Remove all links we have already visited
         for link in tree.findall(".//a"):
-                try:
-                    url = urllib.parse.urldefrag(link.attrib['href'])[0]
-                    if (url and url not in self.unvisitedURLs and url
+            try:
+                url = urllib.parse.urldefrag(link.attrib['href'])[0]
+                if (url and url not in self.unvisitedURLs and url
                         not in self.visitedURLs):
-                        urls.add(url)
-                except KeyError:
-                    pass
+                    urls.add(url)
+            except KeyError:
+                pass
 
-        #Remove all non-http URLs and add a suitable base URL where that is
-        #missing
+        # Remove all non-http URLs and add a suitable base URL where that is
+        # missing
         newUrls = set()
         for url in urls:
             splitURL = list(urllib.parse.urlsplit(url))
@@ -93,23 +97,22 @@ def updateURLs(self, tree):
         urls = newUrls
 
         responseHeaders = {}
-        #Now we want to find the content types of the links we haven't visited
+        # Now we want to find the content types of the links we haven't visited
         for url in urls:
             try:
                 resp, content = self.http.request(url, "HEAD")
                 responseHeaders[url] = resp
-            except AttributeError as KeyError:
-                #Don't know why this happens
+            except AttributeError:
+                # Don't know why this happens
                 pass
 
-
-        #Remove links not of content-type html or pages not found
-        #XXX - need to deal with other status codes?
+        # Remove links not of content-type html or pages not found
+        # XXX - need to deal with other status codes?
         toVisit = set([url for url in urls if url in responseHeaders and
-                      "html" in responseHeaders[url]['content-type'] and
-                      responseHeaders[url]['status'] == "200"])
+                       "html" in responseHeaders[url]['content-type'] and
+                       responseHeaders[url]['status'] == "200"])
 
-        #Now check we are allowed to spider the page
+        # Now check we are allowed to spider the page
         for url in toVisit:
             robotURL = list(urllib.parse.urlsplit(url)[:2])
             robotURL.extend(["robots.txt", "", ""])
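The deleted except AttributeError as KeyError line above is a classic Python 2-to-3 conversion scar: the old spelling except AttributeError, KeyError: looks like it catches both exceptions, but the comma form (and its mechanical as translation) actually binds the caught AttributeError to the name KeyError, shadowing the builtin and catching only one type. Catching several exception types takes a tuple:

    try:
        {}["missing"]
    except (AttributeError, KeyError) as exc:  # the tuple catches either type
        print(type(exc).__name__)  # -> KeyError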