From c48822771eb4dc29ca72099254f1a72f9a7ce123 Mon Sep 17 00:00:00 2001
From: Jim Baker <jim.baker@rackspace.com>
Date: Fri, 2 May 2014 20:59:47 -0600
Subject: [PATCH 01/13] Do not directly use isolated surrogates in unicode
 literals for platforms besides Jython

---
 html5lib/inputstream.py | 44 ++++++++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index 9e03b931..d6ca39a3 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -3,6 +3,7 @@
 from six.moves import http_client
 
 import codecs
+import platform
 import re
 
 from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
@@ -28,7 +29,19 @@ class BufferedIOBase(object):
 asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
 spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
 
-invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
+
+invalid_unicode_template = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF%s]"
+
+if platform.python_implementation() == "Jython":
+    # Jython does not allow the use of solitary surrogate escapes
+    # (\uD800-\uDFFF) in literals or other usage. This is because it
+    # uses UTF-16, which is based on the use of such surrogates.
+    invalid_unicode_re = re.compile(invalid_unicode_template % "")
+else:
+    # Instead use one extra step of indirection and create surrogates with
+    # unichr
+    invalid_unicode_re = re.compile(invalid_unicode_template % (
+        "%s-%s" % (unichr(0xD800), unichr(0xDFFF)),))
 
 non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                   0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
@@ -164,13 +177,23 @@ def __init__(self, source):
 
         """
 
-        # Craziness
-        if len("\U0010FFFF") == 1:
+        if platform.python_implementation() == "Jython":
+            # By its nature Jython's UTF-16 support does not allow
+            # surrogate errors, so no need to do this checking.
+            self.reportCharacterErrors = None
+            self.replaceCharactersRegexp = None
+        elif len("\U0010FFFF") == 1:
             self.reportCharacterErrors = self.characterErrorsUCS4
-            self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
+            self.replaceCharactersRegexp = re.compile("[%s-%s]" % (
+                unichr(0xD800), unichr(0xDFFF)))
         else:
             self.reportCharacterErrors = self.characterErrorsUCS2
-            self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
+            self.replaceCharactersRegexp = re.compile(
+                "([%s-%s](?![%s-%s])|(?<![%s-%s])[%s-%s])" % (
+                    unichr(0xD800), unichr(0xDBFF),
+                    unichr(0xDC00), unichr(0xDFFF),
+                    unichr(0xD800), unichr(0xDBFF),
+                    unichr(0xDC00), unichr(0xDFFF)))
 
         # List of where new lines occur
         self.newLines = [0]
@@ -265,11 +288,14 @@ def readChunk(self, chunkSize=None):
                 self._bufferedCharacter = data[-1]
                 data = data[:-1]
 
-        self.reportCharacterErrors(data)
+        if platform.python_implementation() != "Jython":
+            # data is already Unicode, so Jython already has dealt
+            # with any surrogate character errors, no need to go here
+            self.reportCharacterErrors(data)
 
-        # Replace invalid characters
-        # Note U+0000 is dealt with in the tokenizer
-        data = self.replaceCharactersRegexp.sub("\ufffd", data)
+            # Replace invalid characters
+            # Note U+0000 is dealt with in the tokenizer
+            data = self.replaceCharactersRegexp.sub("\ufffd", data)
 
         data = data.replace("\r\n", "\n")
         data = data.replace("\r", "\n")

From 89c732fff769950bcc09d7bb412530d1b09b2591 Mon Sep 17 00:00:00 2001
From: Jim Baker <jim.baker@rackspace.com>
Date: Fri, 2 May 2014 21:37:02 -0600
Subject: [PATCH 02/13] Use six.unichr for Python 3.x

---
 html5lib/inputstream.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index d6ca39a3..ab47c710 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -1,5 +1,5 @@
 from __future__ import absolute_import, division, unicode_literals
-from six import text_type
+from six import text_type, unichr
 from six.moves import http_client
 
 import codecs

From b182d1d59647aee388ce7c391f098d76ea2aadbf Mon Sep 17 00:00:00 2001
From: Jim Baker <jim.baker@rackspace.com>
Date: Mon, 16 Jun 2014 14:33:33 -0600
Subject: [PATCH 03/13] Ignore compiled Python classes for Jython

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 73d97fec..755066a9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 # Because we never want compiled Python
 __pycache__/
 *.pyc
+*$py.class
 
 # Ignore stuff produced by distutils
 /build/

From f8d50aa6af87eb63ed7d76f82b252eec1bbb1d95 Mon Sep 17 00:00:00 2001
From: Jim Baker <jim.baker@rackspace.com>
Date: Mon, 16 Jun 2014 14:35:10 -0600
Subject: [PATCH 04/13] Pass on constructed tests in test_tokenizer that
 attempt to build HTMLUnicodeInputStream objects from unicode strings that
 contain isolated surrogates. Such tests are not meaningful on Jython which
 does not allow for invalid unicode strings to be decoded in the first place.

---
 html5lib/tests/test_tokenizer.py | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/html5lib/tests/test_tokenizer.py b/html5lib/tests/test_tokenizer.py
index 90315ab3..d33cc79d 100644
--- a/html5lib/tests/test_tokenizer.py
+++ b/html5lib/tests/test_tokenizer.py
@@ -1,6 +1,7 @@
 from __future__ import absolute_import, division, unicode_literals
 
 import json
+import platform
 import warnings
 import re
 
@@ -122,9 +123,26 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
         return tokens["expected"] == tokens["received"]
 
 
+_surrogateRe = re.compile(r"\\u(?P<codepoint>[0-9A-Fa-f]{4})")
+
+
 def unescape(test):
     def decode(inp):
-        return inp.encode("utf-8").decode("unicode-escape")
+        try:
+            return inp.encode("utf-8").decode("unicode-escape")
+        except UnicodeDecodeError:
+            possible_surrogate_match = _surrogateRe.search(inp)
+            if possible_surrogate_match and platform.python_implementation() == "Jython":
+                possible_surrogate = int(possible_surrogate_match.group("codepoint"), 16)
+                if possible_surrogate >= 0xD800 and possible_surrogate <= 0xDFFF:
+                    # Not valid unicode input for Jython.
+                    #
+                    # NOTE it's not even possible to have such
+                    # isolated surrogates in unicode input streams in
+                    # Jython - the decoding to unicode would have
+                    # raised a similar UnicodeDecodeError.
+                    return None
+            raise
 
     test["input"] = decode(test["input"])
     for token in test["output"]:
@@ -183,6 +201,8 @@ def testTokenizer():
                         test["initialStates"] = ["Data state"]
                     if 'doubleEscaped' in test:
                         test = unescape(test)
+                        if test["input"] is None:
+                            continue  # Not valid input for this platform
                     for initialState in test["initialStates"]:
                         test["initialState"] = capitalize(initialState)
                         yield runTokenizerTest, test

From ce7ad47b1d2de17d5a6818d93d8819283dc03c13 Mon Sep 17 00:00:00 2001
From: Jim Baker <jim.baker@rackspace.com>
Date: Tue, 12 Aug 2014 20:33:34 +0200
Subject: [PATCH 05/13] Use utils.supports_lone_surrogates in place of
 Jython-specific logic

---
 html5lib/inputstream.py          | 24 ++++++++++--------------
 html5lib/tests/test_tokenizer.py | 13 +++++++------
 html5lib/utils.py                | 14 +++++++++++++-
 3 files changed, 30 insertions(+), 21 deletions(-)

diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index ab47c710..fb5ea759 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -3,7 +3,6 @@
 from six.moves import http_client
 
 import codecs
-import platform
 import re
 
 from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
@@ -32,16 +31,15 @@ class BufferedIOBase(object):
 
 invalid_unicode_template = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF%s]"
 
-if platform.python_implementation() == "Jython":
-    # Jython does not allow the use of solitary surrogate escapes
-    # (\uD800-\uDFFF) in literals or other usage. This is because it
-    # uses UTF-16, which is based on the use of such surrogates.
-    invalid_unicode_re = re.compile(invalid_unicode_template % "")
-else:
-    # Instead use one extra step of indirection and create surrogates with
-    # unichr
+if utils.supports_lone_surrogates:
+    # Use one extra step of indirection and create surrogates with
+    # unichr. Not using this indirection would introduce an illegal
+    # unicode literal on platforms not supporting such lone
+    # surrogates.
     invalid_unicode_re = re.compile(invalid_unicode_template % (
         "%s-%s" % (unichr(0xD800), unichr(0xDFFF)),))
+else:
+    invalid_unicode_re = re.compile(invalid_unicode_template % "")
 
 non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                   0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
@@ -177,8 +175,8 @@ def __init__(self, source):
 
         """
 
-        if platform.python_implementation() == "Jython":
-            # By its nature Jython's UTF-16 support does not allow
+        if not utils.supports_lone_surrogates:
+            # Such platforms will have already checked for such
             # surrogate errors, so no need to do this checking.
             self.reportCharacterErrors = None
             self.replaceCharactersRegexp = None
@@ -288,9 +286,7 @@ def readChunk(self, chunkSize=None):
                 self._bufferedCharacter = data[-1]
                 data = data[:-1]
 
-        if platform.python_implementation() != "Jython":
-            # data is already Unicode, so Jython already has dealt
-            # with any surrogate character errors, no need to go here
+        if utils.supports_lone_surrogates:
             self.reportCharacterErrors(data)
 
             # Replace invalid characters
diff --git a/html5lib/tests/test_tokenizer.py b/html5lib/tests/test_tokenizer.py
index d33cc79d..7f4c02ba 100644
--- a/html5lib/tests/test_tokenizer.py
+++ b/html5lib/tests/test_tokenizer.py
@@ -1,14 +1,13 @@
 from __future__ import absolute_import, division, unicode_literals
 
 import json
-import platform
 import warnings
 import re
 
 from .support import get_data_files
 
 from html5lib.tokenizer import HTMLTokenizer
-from html5lib import constants
+from html5lib import constants, utils
 
 
 class TokenizerTestParser(object):
@@ -132,15 +131,17 @@ def decode(inp):
             return inp.encode("utf-8").decode("unicode-escape")
         except UnicodeDecodeError:
             possible_surrogate_match = _surrogateRe.search(inp)
-            if possible_surrogate_match and platform.python_implementation() == "Jython":
+            if possible_surrogate_match and not utils.supports_lone_surrogates:
                 possible_surrogate = int(possible_surrogate_match.group("codepoint"), 16)
                 if possible_surrogate >= 0xD800 and possible_surrogate <= 0xDFFF:
-                    # Not valid unicode input for Jython.
+                    # Not valid unicode input for platforms that do
+                    # not have support for lone surrogates.
                     #
                     # NOTE it's not even possible to have such
                     # isolated surrogates in unicode input streams in
-                    # Jython - the decoding to unicode would have
-                    # raised a similar UnicodeDecodeError.
+                    # such platforms (like Jython) - the decoding to
+                    # unicode would have raised a similar
+                    # UnicodeDecodeError.
                     return None
             raise
 
diff --git a/html5lib/utils.py b/html5lib/utils.py
index 2f41f4df..62cd80ce 100644
--- a/html5lib/utils.py
+++ b/html5lib/utils.py
@@ -1,5 +1,6 @@
 from __future__ import absolute_import, division, unicode_literals
 
+import platform
 from types import ModuleType
 
 try:
@@ -9,7 +10,18 @@
 
 
 __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
-           "surrogatePairToCodepoint", "moduleFactoryFactory"]
+           "surrogatePairToCodepoint", "moduleFactoryFactory",
+           "supports_lone_surrogates"]
+
+
+# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
+# added to the below test. In general this would be any platform using
+# UTF-16 as its encoding of unicode strings, such as Jython. This is
+# because UTF-16 itself is based on the use of such surrogates, and
+# there is no mechanism to further escape such escapes.
+# 
+# Otherwise we assume such support.
+supports_lone_surrogates = platform.python_implementation() != "Jython"
 
 
 class MethodDispatcher(dict):

From 75e970b80a18f0bde654f70fd1867efb74be30d3 Mon Sep 17 00:00:00 2001
From: werner mendizabal <nonameentername@gmail.com>
Date: Tue, 7 Oct 2014 14:45:02 -0500
Subject: [PATCH 06/13] Fix flake8 trailing whitespace

---
 html5lib/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/html5lib/utils.py b/html5lib/utils.py
index 62cd80ce..21c32c54 100644
--- a/html5lib/utils.py
+++ b/html5lib/utils.py
@@ -19,7 +19,7 @@
 # UTF-16 as its encoding of unicode strings, such as Jython. This is
 # because UTF-16 itself is based on the use of such surrogates, and
 # there is no mechanism to further escape such escapes.
-# 
+#
 # Otherwise we assume such support.
 supports_lone_surrogates = platform.python_implementation() != "Jython"
 

From a11981a480e26b9d91ac81c58c0471feb9b4c77a Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Sun, 26 Apr 2015 01:24:13 +0100
Subject: [PATCH 07/13] Use eval instead of unichr to deal with lone
 surrogates.

IMO this is a lot more readable.
---
 html5lib/inputstream.py | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index fb5ea759..dd713ace 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -1,5 +1,5 @@
 from __future__ import absolute_import, division, unicode_literals
-from six import text_type, unichr
+from six import text_type
 from six.moves import http_client
 
 import codecs
@@ -29,17 +29,17 @@ class BufferedIOBase(object):
 spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
 
 
-invalid_unicode_template = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF%s]"
+invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"
 
 if utils.supports_lone_surrogates:
     # Use one extra step of indirection and create surrogates with
     # unichr. Not using this indirection would introduce an illegal
     # unicode literal on platforms not supporting such lone
     # surrogates.
-    invalid_unicode_re = re.compile(invalid_unicode_template % (
-        "%s-%s" % (unichr(0xD800), unichr(0xDFFF)),))
+    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate +
+                                    eval('"\\uD800-\\uDFFF"'))
 else:
-    invalid_unicode_re = re.compile(invalid_unicode_template % "")
+    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
 
 non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                   0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
@@ -182,16 +182,11 @@ def __init__(self, source):
             self.replaceCharactersRegexp = None
         elif len("\U0010FFFF") == 1:
             self.reportCharacterErrors = self.characterErrorsUCS4
-            self.replaceCharactersRegexp = re.compile("[%s-%s]" % (
-                unichr(0xD800), unichr(0xDFFF)))
+            self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
         else:
             self.reportCharacterErrors = self.characterErrorsUCS2
             self.replaceCharactersRegexp = re.compile(
-                "([%s-%s](?![%s-%s])|(?<![%s-%s])[%s-%s])" % (
-                    unichr(0xD800), unichr(0xDBFF),
-                    unichr(0xDC00), unichr(0xDFFF),
-                    unichr(0xD800), unichr(0xDBFF),
-                    unichr(0xDC00), unichr(0xDFFF)))
+                eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))
 
         # List of where new lines occur
         self.newLines = [0]

From f4ee9d3c81d6377f19dcfc19eedbaf39faa2f927 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Sun, 26 Apr 2015 01:25:00 +0100
Subject: [PATCH 08/13] Check whether what we want is available, not
 interlinking things.

---
 html5lib/inputstream.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index dd713ace..7020aa60 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -281,7 +281,7 @@ def readChunk(self, chunkSize=None):
                 self._bufferedCharacter = data[-1]
                 data = data[:-1]
 
-        if utils.supports_lone_surrogates:
+        if self.reportCharacterErrors:
             self.reportCharacterErrors(data)
 
             # Replace invalid characters

From 625303feaa83062c227bfa823ade063f8c0616f7 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Sun, 26 Apr 2015 02:05:30 +0100
Subject: [PATCH 09/13] unicode-escape is undocumented as a decoder, so
 implement our own.

This also makes it easier to see what is going wrong if something
fails under Jython.
---
 html5lib/tests/test_tokenizer.py | 39 ++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/html5lib/tests/test_tokenizer.py b/html5lib/tests/test_tokenizer.py
index 7f4c02ba..0dfbb540 100644
--- a/html5lib/tests/test_tokenizer.py
+++ b/html5lib/tests/test_tokenizer.py
@@ -4,6 +4,8 @@
 import warnings
 import re
 
+from six import unichr
+
 from .support import get_data_files
 
 from html5lib.tokenizer import HTMLTokenizer
@@ -122,28 +124,31 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
         return tokens["expected"] == tokens["received"]
 
 
-_surrogateRe = re.compile(r"\\u(?P<codepoint>[0-9A-Fa-f]{4})")
+_surrogateRe = re.compile(r"\\u([0-9A-Fa-f]{4})(?:\\u([0-9A-Fa-f]{4}))?")
 
 
 def unescape(test):
     def decode(inp):
+        def repl(m):
+            if m.group(2) is not None:
+                high = int(m.group(1), 16)
+                low = int(m.group(2), 16)
+                if (0xD800 <= high <= 0xDBFF and
+                        0xDC00 <= low <= 0xDFFF):
+                    cp = ((high - 0xD800) << 10) + (low - 0xDc00) + 0x10000
+                    return unichr(cp)
+                else:
+                    return unichr(high) + unichr(low)
+            else:
+                return unichr(int(m.group(1), 16))
         try:
-            return inp.encode("utf-8").decode("unicode-escape")
-        except UnicodeDecodeError:
-            possible_surrogate_match = _surrogateRe.search(inp)
-            if possible_surrogate_match and not utils.supports_lone_surrogates:
-                possible_surrogate = int(possible_surrogate_match.group("codepoint"), 16)
-                if possible_surrogate >= 0xD800 and possible_surrogate <= 0xDFFF:
-                    # Not valid unicode input for platforms that do
-                    # not have support for lone surrogates.
-                    #
-                    # NOTE it's not even possible to have such
-                    # isolated surrogates in unicode input streams in
-                    # such platforms (like Jython) - the decoding to
-                    # unicode would have raised a similar
-                    # UnicodeDecodeError.
-                    return None
-            raise
+            return _surrogateRe.sub(repl, inp)
+        except ValueError:
+            # This occurs when unichr throws ValueError, which should
+            # only be for a lone-surrogate.
+            if utils.supports_lone_surrogates:
+                raise
+            return None
 
     test["input"] = decode(test["input"])
     for token in test["output"]:

From e04fff83e37044ff0e8f895a7a66f3f85d69c32a Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Sun, 26 Apr 2015 02:41:24 +0100
Subject: [PATCH 10/13] Rewrite supports_lone_surrogates to feature-sniff.

---
 html5lib/utils.py | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/html5lib/utils.py b/html5lib/utils.py
index 21c32c54..fdc18feb 100644
--- a/html5lib/utils.py
+++ b/html5lib/utils.py
@@ -1,8 +1,9 @@
 from __future__ import absolute_import, division, unicode_literals
 
-import platform
 from types import ModuleType
 
+from six import text_type
+
 try:
     import xml.etree.cElementTree as default_etree
 except ImportError:
@@ -15,13 +16,21 @@
 
 
 # Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
-# added to the below test. In general this would be any platform using
-# UTF-16 as its encoding of unicode strings, such as Jython. This is
-# because UTF-16 itself is based on the use of such surrogates, and
-# there is no mechanism to further escape such escapes.
-#
-# Otherwise we assume such support.
-supports_lone_surrogates = platform.python_implementation() != "Jython"
+# caught by the below test. In general this would be any platform
+# using UTF-16 as its encoding of unicode strings, such as
+# Jython. This is because UTF-16 itself is based on the use of such
+# surrogates, and there is no mechanism to further escape such
+# escapes.
+try:
+    _x = eval('"\\uD800"')
+    if not isinstance(_x, text_type):
+        # We need this with u"" because of http://bugs.jython.org/issue2039
+        _x = eval('u"\\uD800"')
+        assert isinstance(_x, text_type)
+except:
+    supports_lone_surrogates = False
+else:
+    supports_lone_surrogates = True
 
 
 class MethodDispatcher(dict):

From db3042d84841ce0f42d02323c580507cf869db37 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Tue, 28 Apr 2015 19:32:39 +0100
Subject: [PATCH 11/13] fixup! unicode-escape is undocumented as a decoder, so
 implement our own.

---
 html5lib/tests/test_tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/html5lib/tests/test_tokenizer.py b/html5lib/tests/test_tokenizer.py
index 0dfbb540..64da03a0 100644
--- a/html5lib/tests/test_tokenizer.py
+++ b/html5lib/tests/test_tokenizer.py
@@ -135,7 +135,7 @@ def repl(m):
                 low = int(m.group(2), 16)
                 if (0xD800 <= high <= 0xDBFF and
                         0xDC00 <= low <= 0xDFFF):
-                    cp = ((high - 0xD800) << 10) + (low - 0xDc00) + 0x10000
+                    cp = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000
                     return unichr(cp)
                 else:
                     return unichr(high) + unichr(low)

From e4ec7857d623c9ceb9b5b3d77827c08faffa220f Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Tue, 28 Apr 2015 22:59:42 +0100
Subject: [PATCH 12/13] fixup! unicode-escape is undocumented as a decoder, so
 implement our own.

---
 html5lib/tests/test_tokenizer.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/html5lib/tests/test_tokenizer.py b/html5lib/tests/test_tokenizer.py
index 64da03a0..154f34e5 100644
--- a/html5lib/tests/test_tokenizer.py
+++ b/html5lib/tests/test_tokenizer.py
@@ -133,8 +133,7 @@ def repl(m):
             if m.group(2) is not None:
                 high = int(m.group(1), 16)
                 low = int(m.group(2), 16)
-                if (0xD800 <= high <= 0xDBFF and
-                        0xDC00 <= low <= 0xDFFF):
+                if 0xD800 <= high <= 0xDBFF and 0xDC00 <= low <= 0xDFFF:
                     cp = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000
                     return unichr(cp)
                 else:

From 5f5dd5797eaf244037dd2771a565390628dc935f Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon <geoffers@gmail.com>
Date: Tue, 28 Apr 2015 23:03:22 +0100
Subject: [PATCH 13/13] fixup! unicode-escape is undocumented as a decoder, so
 implement our own.

---
 html5lib/tests/test_tokenizer.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/html5lib/tests/test_tokenizer.py b/html5lib/tests/test_tokenizer.py
index 154f34e5..737ab679 100644
--- a/html5lib/tests/test_tokenizer.py
+++ b/html5lib/tests/test_tokenizer.py
@@ -129,6 +129,14 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
 
 def unescape(test):
     def decode(inp):
+        """Decode \\uXXXX escapes
+
+        This decodes \\uXXXX escapes, possibly into non-BMP characters when
+        two surrogate character escapes are adjacent to each other.
+        """
+        # This cannot be implemented using the unicode_escape codec
+        # because that requires its input be ISO-8859-1, and we need
+        # arbitrary unicode as input.
         def repl(m):
             if m.group(2) is not None:
                 high = int(m.group(1), 16)