diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index 9e03b931..7020aa60 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -28,7 +28,18 @@ class BufferedIOBase(object):
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
-invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
+
+invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"
+
+if utils.supports_lone_surrogates:
+    # Build the surrogate range through eval so that no lone-surrogate
+    # literal appears in this source file (such a literal is illegal on
+    # platforms without lone-surrogate support). The range is spliced in
+    # before the closing "]" so it stays inside the character class.
+    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
+                                    eval('"\\uD800-\\uDFFF"') + "]")
+else:
+    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
@@ -164,13 +175,18 @@ def __init__(self, source):
"""
- # Craziness
- if len("\U0010FFFF") == 1:
+ if not utils.supports_lone_surrogates:
+ # Such platforms will have already checked for such
+ # surrogate errors, so no need to do this checking.
+ self.reportCharacterErrors = None
+ self.replaceCharactersRegexp = None
+ elif len("\U0010FFFF") == 1:
self.reportCharacterErrors = self.characterErrorsUCS4
- self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
+ self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
else:
self.reportCharacterErrors = self.characterErrorsUCS2
- self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?