Skip to content

Do not directly use isolated surrogates in unicode literals #185

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 14 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 26 additions & 9 deletions html5lib/inputstream.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,18 @@ class BufferedIOBase(object):
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])

invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")

invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"

if utils.supports_lone_surrogates:
# Use one extra step of indirection and create surrogates with
# unichr. Not using this indirection would introduce an illegal
# unicode literal on platforms not supporting such lone
# surrogates.
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate +
eval('"\\uD800-\\uDFFF"'))
else:
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)

non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
Expand Down Expand Up @@ -164,13 +175,18 @@ def __init__(self, source):

"""

# Craziness
if len("\U0010FFFF") == 1:
if not utils.supports_lone_surrogates:
# Such platforms will have already checked for such
# surrogate errors, so no need to do this checking.
self.reportCharacterErrors = None
self.replaceCharactersRegexp = None
elif len("\U0010FFFF") == 1:
self.reportCharacterErrors = self.characterErrorsUCS4
self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
else:
self.reportCharacterErrors = self.characterErrorsUCS2
self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
self.replaceCharactersRegexp = re.compile(
eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))

# List of where new lines occur
self.newLines = [0]
Expand Down Expand Up @@ -265,11 +281,12 @@ def readChunk(self, chunkSize=None):
self._bufferedCharacter = data[-1]
data = data[:-1]

self.reportCharacterErrors(data)
if self.reportCharacterErrors:
self.reportCharacterErrors(data)

# Replace invalid characters
# Note U+0000 is dealt with in the tokenizer
data = self.replaceCharactersRegexp.sub("\ufffd", data)
# Replace invalid characters
# Note U+0000 is dealt with in the tokenizer
data = self.replaceCharactersRegexp.sub("\ufffd", data)

data = data.replace("\r\n", "\n")
data = data.replace("\r", "\n")
Expand Down
37 changes: 35 additions & 2 deletions html5lib/tests/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
import warnings
import re

from six import unichr

from .support import get_data_files

from html5lib.tokenizer import HTMLTokenizer
from html5lib import constants
from html5lib import constants, utils


class TokenizerTestParser(object):
Expand Down Expand Up @@ -122,9 +124,38 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
return tokens["expected"] == tokens["received"]


_surrogateRe = re.compile(r"\\u([0-9A-Fa-f]{4})(?:\\u([0-9A-Fa-f]{4}))?")


def unescape(test):
def decode(inp):
return inp.encode("utf-8").decode("unicode-escape")
"""Decode \uXXXX escapes

This decodes \uXXXX escapes, possibly into non-BMP characters when
two surrogate character escapes are adjacent to each other.
"""
# This cannot be implemented using the unicode_escape codec
# because that requires its input be ISO-8859-1, and we need
# arbitrary unicode as input.
def repl(m):
if m.group(2) is not None:
high = int(m.group(1), 16)
low = int(m.group(2), 16)
if 0xD800 <= high <= 0xDBFF and 0xDC00 <= low <= 0xDFFF:
cp = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000
return unichr(cp)
else:
return unichr(high) + unichr(low)
else:
return unichr(int(m.group(1), 16))
try:
return _surrogateRe.sub(repl, inp)
except ValueError:
# This occurs when unichr throws ValueError, which should
# only be for a lone-surrogate.
if utils.supports_lone_surrogates:
raise
return None

test["input"] = decode(test["input"])
for token in test["output"]:
Expand Down Expand Up @@ -183,6 +214,8 @@ def testTokenizer():
test["initialStates"] = ["Data state"]
if 'doubleEscaped' in test:
test = unescape(test)
if test["input"] is None:
continue # Not valid input for this platform
for initialState in test["initialStates"]:
test["initialState"] = capitalize(initialState)
yield runTokenizerTest, test
23 changes: 22 additions & 1 deletion html5lib/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,35 @@

from types import ModuleType

from six import text_type

try:
import xml.etree.cElementTree as default_etree
except ImportError:
import xml.etree.ElementTree as default_etree


__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
"surrogatePairToCodepoint", "moduleFactoryFactory"]
"surrogatePairToCodepoint", "moduleFactoryFactory",
"supports_lone_surrogates"]


# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
# caught by the below test. In general this would be any platform
# using UTF-16 as its encoding of unicode strings, such as
# Jython. This is because UTF-16 itself is based on the use of such
# surrogates, and there is no mechanism to further escape such
# escapes.
try:
_x = eval('"\\uD800"')
if not isinstance(_x, text_type):
# We need this with u"" because of http://bugs.jython.org/issue2039
_x = eval('u"\\uD800"')
assert isinstance(_x, text_type)
except:
supports_lone_surrogates = False
else:
supports_lone_surrogates = True


class MethodDispatcher(dict):
Expand Down