From c48822771eb4dc29ca72099254f1a72f9a7ce123 Mon Sep 17 00:00:00 2001 From: Jim Baker Date: Fri, 2 May 2014 20:59:47 -0600 Subject: [PATCH 01/13] Do not directly use isolated surrogates in unicode literals for platforms besides Jython --- html5lib/inputstream.py | 44 ++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index 9e03b931..d6ca39a3 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -3,6 +3,7 @@ from six.moves import http_client import codecs +import platform import re from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase @@ -28,7 +29,19 @@ class BufferedIOBase(object): asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase]) spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"]) -invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]") + +invalid_unicode_template = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF%s]" + +if platform.python_implementation() == "Jython": + # Jython does not allow the use of solitary surrogate escapes + # (\uD800-\uDFFF) in literals or other usage. This is because it + # uses UTF-16, which is based on the use of such surrogates. + invalid_unicode_re = re.compile(invalid_unicode_template % "") +else: + # Instead use one extra step of indirection and create surrogates with + # unichr + invalid_unicode_re = re.compile(invalid_unicode_template % ( + "%s-%s" % (unichr(0xD800), unichr(0xDFFF)),)) non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, @@ -164,13 +177,23 @@ def __init__(self, source): """ - # Craziness - if len("\U0010FFFF") == 1: + if platform.python_implementation() == "Jython": + # By its nature Jython's UTF-16 support does not allow + # surrogate errors, so no need to do this checking. + self.reportCharacterErrors = None + self.replaceCharactersRegexp = None + elif len("\U0010FFFF") == 1: self.reportCharacterErrors = self.characterErrorsUCS4 - self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]") + self.replaceCharactersRegexp = re.compile("[%s-%s]" % ( + unichr(0xD800), unichr(0xDFFF))) else: self.reportCharacterErrors = self.characterErrorsUCS2 - self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(? Date: Fri, 2 May 2014 21:37:02 -0600 Subject: [PATCH 02/13] Use six.unichr for Python 3.x --- html5lib/inputstream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index d6ca39a3..ab47c710 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -1,5 +1,5 @@ from __future__ import absolute_import, division, unicode_literals -from six import text_type +from six import text_type, unichr from six.moves import http_client import codecs From b182d1d59647aee388ce7c391f098d76ea2aadbf Mon Sep 17 00:00:00 2001 From: Jim Baker Date: Mon, 16 Jun 2014 14:33:33 -0600 Subject: [PATCH 03/13] Ignore compiled Python classes for Jython --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 73d97fec..755066a9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # Because we never want compiled Python __pycache__/ *.pyc +*$py.class # Ignore stuff produced by distutils /build/ From f8d50aa6af87eb63ed7d76f82b252eec1bbb1d95 Mon Sep 17 00:00:00 2001 From: Jim Baker Date: Mon, 16 Jun 2014 14:35:10 -0600 Subject: [PATCH 04/13] Pass on constructed tests in test_tokenizer that attempt to build HTMLUnicodeInputStream objects from unicode strings that contain isolated surrogates. Such tests are not meaningful on Jython which does not allow for invalid unicode strings to be decoded in the first place. --- html5lib/tests/test_tokenizer.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/html5lib/tests/test_tokenizer.py b/html5lib/tests/test_tokenizer.py index 90315ab3..d33cc79d 100644 --- a/html5lib/tests/test_tokenizer.py +++ b/html5lib/tests/test_tokenizer.py @@ -1,6 +1,7 @@ from __future__ import absolute_import, division, unicode_literals import json +import platform import warnings import re @@ -122,9 +123,26 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder, return tokens["expected"] == tokens["received"] +_surrogateRe = re.compile(r"\\u(?P[0-9A-Fa-f]{4})") + + def unescape(test): def decode(inp): - return inp.encode("utf-8").decode("unicode-escape") + try: + return inp.encode("utf-8").decode("unicode-escape") + except UnicodeDecodeError: + possible_surrogate_match = _surrogateRe.search(inp) + if possible_surrogate_match and platform.python_implementation() == "Jython": + possible_surrogate = int(possible_surrogate_match.group("codepoint"), 16) + if possible_surrogate >= 0xD800 and possible_surrogate <= 0xDFFF: + # Not valid unicode input for Jython. + # + # NOTE it's not even possible to have such + # isolated surrogates in unicode input streams in + # Jython - the decoding to unicode would have + # raised a similar UnicodeDecodeError. + return None + raise test["input"] = decode(test["input"]) for token in test["output"]: @@ -183,6 +201,8 @@ def testTokenizer(): test["initialStates"] = ["Data state"] if 'doubleEscaped' in test: test = unescape(test) + if test["input"] is None: + continue # Not valid input for this platform for initialState in test["initialStates"]: test["initialState"] = capitalize(initialState) yield runTokenizerTest, test From ce7ad47b1d2de17d5a6818d93d8819283dc03c13 Mon Sep 17 00:00:00 2001 From: Jim Baker Date: Tue, 12 Aug 2014 20:33:34 +0200 Subject: [PATCH 05/13] Use utils.supports_lone_surrogates in place of Jython-specific tlogic --- html5lib/inputstream.py | 24 ++++++++++-------------- html5lib/tests/test_tokenizer.py | 13 +++++++------ html5lib/utils.py | 14 +++++++++++++- 3 files changed, 30 insertions(+), 21 deletions(-) diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index ab47c710..fb5ea759 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -3,7 +3,6 @@ from six.moves import http_client import codecs -import platform import re from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase @@ -32,16 +31,15 @@ class BufferedIOBase(object): invalid_unicode_template = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF%s]" -if platform.python_implementation() == "Jython": - # Jython does not allow the use of solitary surrogate escapes - # (\uD800-\uDFFF) in literals or other usage. This is because it - # uses UTF-16, which is based on the use of such surrogates. - invalid_unicode_re = re.compile(invalid_unicode_template % "") -else: - # Instead use one extra step of indirection and create surrogates with - # unichr +if utils.supports_lone_surrogates: + # Use one extra step of indirection and create surrogates with + # unichr. Not using this indirection would introduce an illegal + # unicode literal on platforms not supporting such lone + # surrogates. invalid_unicode_re = re.compile(invalid_unicode_template % ( "%s-%s" % (unichr(0xD800), unichr(0xDFFF)),)) +else: + invalid_unicode_re = re.compile(invalid_unicode_template % "") non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, @@ -177,8 +175,8 @@ def __init__(self, source): """ - if platform.python_implementation() == "Jython": - # By its nature Jython's UTF-16 support does not allow + if not utils.supports_lone_surrogates: + # Such platforms will have already checked for such # surrogate errors, so no need to do this checking. self.reportCharacterErrors = None self.replaceCharactersRegexp = None @@ -288,9 +286,7 @@ def readChunk(self, chunkSize=None): self._bufferedCharacter = data[-1] data = data[:-1] - if platform.python_implementation() != "Jython": - # data is already Unicode, so Jython already has dealt - # with any surrogate character errors, no need to go here + if utils.supports_lone_surrogates: self.reportCharacterErrors(data) # Replace invalid characters diff --git a/html5lib/tests/test_tokenizer.py b/html5lib/tests/test_tokenizer.py index d33cc79d..7f4c02ba 100644 --- a/html5lib/tests/test_tokenizer.py +++ b/html5lib/tests/test_tokenizer.py @@ -1,14 +1,13 @@ from __future__ import absolute_import, division, unicode_literals import json -import platform import warnings import re from .support import get_data_files from html5lib.tokenizer import HTMLTokenizer -from html5lib import constants +from html5lib import constants, utils class TokenizerTestParser(object): @@ -132,15 +131,17 @@ def decode(inp): return inp.encode("utf-8").decode("unicode-escape") except UnicodeDecodeError: possible_surrogate_match = _surrogateRe.search(inp) - if possible_surrogate_match and platform.python_implementation() == "Jython": + if possible_surrogate_match and not utils.supports_lone_surrogates: possible_surrogate = int(possible_surrogate_match.group("codepoint"), 16) if possible_surrogate >= 0xD800 and possible_surrogate <= 0xDFFF: - # Not valid unicode input for Jython. + # Not valid unicode input for platforms that do + # not have support for lone surrogates. # # NOTE it's not even possible to have such # isolated surrogates in unicode input streams in - # Jython - the decoding to unicode would have - # raised a similar UnicodeDecodeError. + # such platforms (like Jython) - the decoding to + # unicode would have raised a similar + # UnicodeDecodeError. return None raise diff --git a/html5lib/utils.py b/html5lib/utils.py index 2f41f4df..62cd80ce 100644 --- a/html5lib/utils.py +++ b/html5lib/utils.py @@ -1,5 +1,6 @@ from __future__ import absolute_import, division, unicode_literals +import platform from types import ModuleType try: @@ -9,7 +10,18 @@ __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair", - "surrogatePairToCodepoint", "moduleFactoryFactory"] + "surrogatePairToCodepoint", "moduleFactoryFactory", + "supports_lone_surrogates"] + + +# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be +# added to the below test. In general this would be any platform using +# UTF-16 as its encoding of unicode strings, such as Jython. This is +# because UTF-16 itself is based on the use of such surrogates, and +# there is no mechanism to further escape such escapes. +# +# Otherwise we assume such support. +supports_lone_surrogates = platform.python_implementation() != "Jython" class MethodDispatcher(dict): From 75e970b80a18f0bde654f70fd1867efb74be30d3 Mon Sep 17 00:00:00 2001 From: werner mendizabal Date: Tue, 7 Oct 2014 14:45:02 -0500 Subject: [PATCH 06/13] Fix flake8 trailing whitespace --- html5lib/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html5lib/utils.py b/html5lib/utils.py index 62cd80ce..21c32c54 100644 --- a/html5lib/utils.py +++ b/html5lib/utils.py @@ -19,7 +19,7 @@ # UTF-16 as its encoding of unicode strings, such as Jython. This is # because UTF-16 itself is based on the use of such surrogates, and # there is no mechanism to further escape such escapes. -# +# # Otherwise we assume such support. supports_lone_surrogates = platform.python_implementation() != "Jython" From a11981a480e26b9d91ac81c58c0471feb9b4c77a Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Sun, 26 Apr 2015 01:24:13 +0100 Subject: [PATCH 07/13] Use eval instead of unichr to deal with lone surrogates. IMO this is a lot more readable. --- html5lib/inputstream.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index fb5ea759..dd713ace 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -1,5 +1,5 @@ from __future__ import absolute_import, division, unicode_literals -from six import text_type, unichr +from six import text_type from six.moves import http_client import codecs @@ -29,17 +29,17 @@ class BufferedIOBase(object): spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"]) -invalid_unicode_template = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF%s]" +invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" if utils.supports_lone_surrogates: # Use one extra step of indirection and create surrogates with # unichr. Not using this indirection would introduce an illegal # unicode literal on platforms not supporting such lone # surrogates. - invalid_unicode_re = re.compile(invalid_unicode_template % ( - "%s-%s" % (unichr(0xD800), unichr(0xDFFF)),)) + invalid_unicode_re = re.compile(invalid_unicode_no_surrogate + + eval('"\\uD800-\\uDFFF"')) else: - invalid_unicode_re = re.compile(invalid_unicode_template % "") + invalid_unicode_re = re.compile(invalid_unicode_no_surrogate) non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, @@ -182,16 +182,11 @@ def __init__(self, source): self.replaceCharactersRegexp = None elif len("\U0010FFFF") == 1: self.reportCharacterErrors = self.characterErrorsUCS4 - self.replaceCharactersRegexp = re.compile("[%s-%s]" % ( - unichr(0xD800), unichr(0xDFFF))) + self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"')) else: self.reportCharacterErrors = self.characterErrorsUCS2 self.replaceCharactersRegexp = re.compile( - "([%s-%s](?![%s-%s])|(? Date: Sun, 26 Apr 2015 01:25:00 +0100 Subject: [PATCH 08/13] Check whether what we want is available, not interlinking things. --- html5lib/inputstream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index dd713ace..7020aa60 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -281,7 +281,7 @@ def readChunk(self, chunkSize=None): self._bufferedCharacter = data[-1] data = data[:-1] - if utils.supports_lone_surrogates: + if self.reportCharacterErrors: self.reportCharacterErrors(data) # Replace invalid characters From 625303feaa83062c227bfa823ade063f8c0616f7 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Sun, 26 Apr 2015 02:05:30 +0100 Subject: [PATCH 09/13] unicode-escape is undocumented as a decoder, so implement our own. This also makes it easier to see what's going wrong if something does under Jython. --- html5lib/tests/test_tokenizer.py | 39 ++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/html5lib/tests/test_tokenizer.py b/html5lib/tests/test_tokenizer.py index 7f4c02ba..0dfbb540 100644 --- a/html5lib/tests/test_tokenizer.py +++ b/html5lib/tests/test_tokenizer.py @@ -4,6 +4,8 @@ import warnings import re +from six import unichr + from .support import get_data_files from html5lib.tokenizer import HTMLTokenizer @@ -122,28 +124,31 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder, return tokens["expected"] == tokens["received"] -_surrogateRe = re.compile(r"\\u(?P[0-9A-Fa-f]{4})") +_surrogateRe = re.compile(r"\\u([0-9A-Fa-f]{4})(?:\\u([0-9A-Fa-f]{4}))?") def unescape(test): def decode(inp): + def repl(m): + if m.group(2) is not None: + high = int(m.group(1), 16) + low = int(m.group(2), 16) + if (0xD800 <= high <= 0xDBFF and + 0xDC00 <= low <= 0xDFFF): + cp = ((high - 0xD800) << 10) + (low - 0xDc00) + 0x10000 + return unichr(cp) + else: + return unichr(high) + unichr(low) + else: + return unichr(int(m.group(1), 16)) try: - return inp.encode("utf-8").decode("unicode-escape") - except UnicodeDecodeError: - possible_surrogate_match = _surrogateRe.search(inp) - if possible_surrogate_match and not utils.supports_lone_surrogates: - possible_surrogate = int(possible_surrogate_match.group("codepoint"), 16) - if possible_surrogate >= 0xD800 and possible_surrogate <= 0xDFFF: - # Not valid unicode input for platforms that do - # not have support for lone surrogates. - # - # NOTE it's not even possible to have such - # isolated surrogates in unicode input streams in - # such platforms (like Jython) - the decoding to - # unicode would have raised a similar - # UnicodeDecodeError. - return None - raise + return _surrogateRe.sub(repl, inp) + except ValueError: + # This occurs when unichr throws ValueError, which should + # only be for a lone-surrogate. + if utils.supports_lone_surrogates: + raise + return None test["input"] = decode(test["input"]) for token in test["output"]: From e04fff83e37044ff0e8f895a7a66f3f85d69c32a Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Sun, 26 Apr 2015 02:41:24 +0100 Subject: [PATCH 10/13] Rewrite support_lone_surrogates to feature-sniff. --- html5lib/utils.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/html5lib/utils.py b/html5lib/utils.py index 21c32c54..fdc18feb 100644 --- a/html5lib/utils.py +++ b/html5lib/utils.py @@ -1,8 +1,9 @@ from __future__ import absolute_import, division, unicode_literals -import platform from types import ModuleType +from six import text_type + try: import xml.etree.cElementTree as default_etree except ImportError: @@ -15,13 +16,21 @@ # Platforms not supporting lone surrogates (\uD800-\uDFFF) should be -# added to the below test. In general this would be any platform using -# UTF-16 as its encoding of unicode strings, such as Jython. This is -# because UTF-16 itself is based on the use of such surrogates, and -# there is no mechanism to further escape such escapes. -# -# Otherwise we assume such support. -supports_lone_surrogates = platform.python_implementation() != "Jython" +# caught by the below test. In general this would be any platform +# using UTF-16 as its encoding of unicode strings, such as +# Jython. This is because UTF-16 itself is based on the use of such +# surrogates, and there is no mechanism to further escape such +# escapes. +try: + _x = eval('"\\uD800"') + if not isinstance(_x, text_type): + # We need this with u"" because of http://bugs.jython.org/issue2039 + _x = eval('u"\\uD800"') + assert isinstance(_x, text_type) +except: + supports_lone_surrogates = False +else: + supports_lone_surrogates = True class MethodDispatcher(dict): From db3042d84841ce0f42d02323c580507cf869db37 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Tue, 28 Apr 2015 19:32:39 +0100 Subject: [PATCH 11/13] fixup! unicode-escape is undocumented as a decoder, so implement our own. --- html5lib/tests/test_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html5lib/tests/test_tokenizer.py b/html5lib/tests/test_tokenizer.py index 0dfbb540..64da03a0 100644 --- a/html5lib/tests/test_tokenizer.py +++ b/html5lib/tests/test_tokenizer.py @@ -135,7 +135,7 @@ def repl(m): low = int(m.group(2), 16) if (0xD800 <= high <= 0xDBFF and 0xDC00 <= low <= 0xDFFF): - cp = ((high - 0xD800) << 10) + (low - 0xDc00) + 0x10000 + cp = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000 return unichr(cp) else: return unichr(high) + unichr(low) From e4ec7857d623c9ceb9b5b3d77827c08faffa220f Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Tue, 28 Apr 2015 22:59:42 +0100 Subject: [PATCH 12/13] fixup! unicode-escape is undocumented as a decoder, so implement our own. --- html5lib/tests/test_tokenizer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/html5lib/tests/test_tokenizer.py b/html5lib/tests/test_tokenizer.py index 64da03a0..154f34e5 100644 --- a/html5lib/tests/test_tokenizer.py +++ b/html5lib/tests/test_tokenizer.py @@ -133,8 +133,7 @@ def repl(m): if m.group(2) is not None: high = int(m.group(1), 16) low = int(m.group(2), 16) - if (0xD800 <= high <= 0xDBFF and - 0xDC00 <= low <= 0xDFFF): + if 0xD800 <= high <= 0xDBFF and 0xDC00 <= low <= 0xDFFF: cp = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000 return unichr(cp) else: From 5f5dd5797eaf244037dd2771a565390628dc935f Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Tue, 28 Apr 2015 23:03:22 +0100 Subject: [PATCH 13/13] fixup! unicode-escape is undocumented as a decoder, so implement our own. --- html5lib/tests/test_tokenizer.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/html5lib/tests/test_tokenizer.py b/html5lib/tests/test_tokenizer.py index 154f34e5..737ab679 100644 --- a/html5lib/tests/test_tokenizer.py +++ b/html5lib/tests/test_tokenizer.py @@ -129,6 +129,14 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder, def unescape(test): def decode(inp): + """Decode \uXXXX escapes + + This decodes \uXXXX escapes, possibly into non-BMP characters when + two surrogate character escapes are adjacent to each other. + """ + # This cannot be implemented using the unicode_escape codec + # because that requires its input be ISO-8859-1, and we need + # arbitrary unicode as input. def repl(m): if m.group(2) is not None: high = int(m.group(1), 16)