Merge branch 'PHP-8.1'

alexdowad · alexdowad · commit 3f12d26e3af9 · 2022-04-16T20:32:12.000+02:00
* PHP-8.1:
  Error handling for UTF-8 complies with WHATWG specification
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c
@@ -127,9 +127,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
 			CK((*filter->output_function)(s, filter->data));
 		} else {
 			CK(mbfl_filt_put_invalid_char(filter));
-			if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
-				goto retry;
-			}
+			goto retry;
 		}
 		break;
 	case 0x20: /* 3byte code 2nd char: 0:0xa0-0xbf,D:0x80-9F,1-C,E-F:0x80-0x9f */
@@ -144,9 +142,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
 			filter->status++;
 		} else {
 			CK(mbfl_filt_put_invalid_char(filter));
-			if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
-				goto retry;
-			}
+			goto retry;
 		}
 		break;
 	case 0x30: /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */
@@ -161,9 +157,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
 			filter->status++;
 		} else {
 			CK(mbfl_filt_put_invalid_char(filter));
-			if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
-				goto retry;
-			}
+			goto retry;
 		}
 		break;
 	case 0x31: /* 4byte code 3rd char: 0x80-0xbf */
@@ -172,9 +166,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
 			filter->status++;
 		} else {
 			CK(mbfl_filt_put_invalid_char(filter));
-			if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
-				goto retry;
-			}
+			goto retry;
 		}
 		break;
 
@@ -237,9 +229,7 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
 				unsigned char c2 = *p++;
 				if ((c2 & 0xC0) != 0x80) {
 					*out++ = MBFL_BAD_INPUT;
-					if (c2 < 0x80 || (c2 >= 0xC2 && c2 <= 0xF4)) {
-						p--;
-					}
+					p--;
 				} else {
 					*out++ = ((c & 0x1F) << 6) | (c2 & 0x3F);
 				}
@@ -252,34 +242,21 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
 				unsigned char c3 = *p++;
 				if ((c2 & 0xC0) != 0x80 || !((c2 >= 0x80 && c2 <= 0xBF) && ((c == 0xE0 && c2 >= 0xA0) || (c == 0xED && c2 < 0xA0) || (c > 0xE0 && c != 0xED)))) {
 					*out++ = MBFL_BAD_INPUT;
-					if (c2 < 0x80 || (c2 >= 0xC2 && c2 <= 0xF4)) {
-						p -= 2;
-					} else {
-						p--;
-					}
+					p -= 2;
 				} else if ((c3 & 0xC0) != 0x80) {
 					*out++ = MBFL_BAD_INPUT;
-					if (c3 < 0x80 || (c3 >= 0xC2 && c3 <= 0xF4)) {
-						p--;
-					}
+					p--;
 				} else {
 					uint32_t decoded = ((c & 0xF) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
-					if (decoded >= 0xD800 && decoded <= 0xDFFF) {
+					if (decoded < 0x800 || (decoded >= 0xD800 && decoded <= 0xDFFF)) {
 						*out++ = MBFL_BAD_INPUT;
 					} else {
-						*out++ = (decoded < 0x800) ? MBFL_BAD_INPUT : decoded;
+						*out++ = decoded;
 					}
 				}
 			} else {
 				*out++ = MBFL_BAD_INPUT;
-				/* Skip over some number of bytes to duplicate error-handling behavior of old implementation */
-				while (p < e) {
-					c = *p;
-					if ((c & 0xC0) != 0x80) {
-						if (c >= 0x80 && (c < 0xC2 || c > 0xF4))
-							p++;
-						break;
-					}
+				while (p < e && (*p & 0xC0) == 0x80) {
 					p++;
 				}
 			}
@@ -288,51 +265,28 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
 				unsigned char c2 = *p++;
 				unsigned char c3 = *p++;
 				unsigned char c4 = *p++;
-				if ((c2 & 0xC0) != 0x80) {
+				/* If c == 0xF0 and c2 < 0x90, then this is an over-long code unit; it could have
+				 * fit in 3 bytes only. If c == 0xF4 and c2 >= 0x90, then this codepoint is
+				 * greater than U+10FFFF, which is the highest legal codepoint */
+				if ((c2 & 0xC0) != 0x80 || (c == 0xF0 && c2 < 0x90) || (c == 0xF4 && c2 >= 0x90)) {
 					*out++ = MBFL_BAD_INPUT;
-					if (c2 < 0x80 || (c2 >= 0xC2 && c2 <= 0xF4)) {
-						p -= 3;
-					} else {
-						p -= 2;
-					}
-				} else if ((c3 & 0xC0) != 0x80 || !((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90) || (c > 0xF0 && c < 0xF4))) {
+					p -= 3;
+				} else if ((c3 & 0xC0) != 0x80) {
 					*out++ = MBFL_BAD_INPUT;
-					if (!((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90) || (c > 0xF0 && c < 0xF4))) {
-						if (c2 >= 0x80 && (c2 < 0xC2 || c2 > 0xF4)) {
-							p -= 2;
-						} else {
-							p -= 3;
-						}
-					} else if (c3 < 0x80 || (c3 >= 0xC2 && c3 <= 0xF4)) {
-						p -= 2;
-					} else {
-						p--;
-					}
+					p -= 2;
 				} else if ((c4 & 0xC0) != 0x80) {
 					*out++ = MBFL_BAD_INPUT;
-					if (c4 < 0x80 || (c4 >= 0xC2 && c4 <= 0xF4)) {
-						p--;
-					}
+					p--;
 				} else {
 					uint32_t decoded = ((c & 0x7) << 18) | ((c2 & 0x3F) << 12) | ((c3 & 0x3F) << 6) | (c4 & 0x3F);
 					*out++ = (decoded < 0x10000) ? MBFL_BAD_INPUT : decoded;
 				}
 			} else {
 				*out++ = MBFL_BAD_INPUT;
-				/* Skip over some number of bytes to duplicate error-handling behavior of old implementation */
 				if (p < e) {
 					unsigned char c2 = *p;
-					if (!((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90) || (c > 0xF0 && c < 0xF4))) {
-						if (c2 >= 0x80 && (c2 < 0xC2 || c2 > 0xF4))
-							p++;
-					} else {
-						while (p < e) {
-							c = *p;
-							if ((c & 0xC0) != 0x80) {
-								if (c >= 0x80 && (c < 0xC2 || c > 0xF4))
-									p++;
-								break;
-							}
+					if ((c == 0xF0 && c2 >= 0x90) || (c == 0xF4 && c2 < 0x90) || c == 0xF2 || c == 0xF3) {
+						while (p < e && (*p & 0xC0) == 0x80) {
 							p++;
 						}
 					}
diff --git a/ext/mbstring/tests/illformed_utf_sequences.phpt b/ext/mbstring/tests/illformed_utf_sequences.phpt
@@ -21,28 +21,28 @@ var_dump(chk_enc("\x31\x32\x33", 0));
 var_dump(chk_enc("\x41\x42\x43", 0));
 var_dump(chk_enc("\xc0\xb1\xc0\xb2\xc0\xb3", 6));
 var_dump(chk_enc("\xc1\x81\xc1\x82\xc1\x83", 6));
-var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 6));
-var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 6));
-var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 9));
-var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 8));
+var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 9));
+var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 9));
+var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 12));
+var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 11));
 var_dump(chk_enc("\xf8\x80\x80\x80\xb1\xf8\x80\x80\x80\xb2\xf8\x80\x80\x80\xb3", 15));
 var_dump(chk_enc("\xf8\x80\x80\x81\x81\xf8\x80\x80\x81\x82\xf8\x80\x80\x81\x83", 15));
 var_dump(chk_enc("\xfc\x80\x80\x80\x80\xb1\xfc\x80\x80\x80\x80\xb2\xfc\x80\x80\x80\x80\xb3", 18));
 var_dump(chk_enc("\xfc\x80\x80\x80\x81\x81\xfc\x80\x80\x80\x81\x82\xfc\x80\x80\x80\x81\x83", 18));
 
 var_dump(chk_enc("\xc2\xa2\xc2\xa3\xc2\xa5", 0));
-var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 6));
-var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 9));
+var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 9));
+var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 12));
 var_dump(chk_enc("\xf8\x80\x80\x82\xa2\xf8\x80\x80\x82\xa3\xf8\x80\x80\x82\xa5", 15));
 var_dump(chk_enc("\xfc\x80\x80\x80\x82\xa2\xfc\x80\x80\x80\x82\xa3\xfc\x80\x80\x80\x82\xa5", 18));
 
 var_dump(chk_enc("\xc1\xbf", 2));
 var_dump(chk_enc("\xc2\x80", 0));
 var_dump(chk_enc("\xdf\xbf", 0));
-var_dump(chk_enc("\xe0\x9f\xff", 2));
+var_dump(chk_enc("\xe0\x9f\xff", 3));
 var_dump(chk_enc("\xe0\xa0\x80", 2));
 var_dump(chk_enc("\xef\xbf\xbf", 0));
-var_dump(chk_enc("\xf0\x8f\xbf\xbf", 3));
+var_dump(chk_enc("\xf0\x8f\xbf\xbf", 4));
 var_dump(chk_enc("\xf0\x90\x80\x80", 0));
 var_dump(chk_enc("\xf7\xbf\xbf\xbf", 4));
 var_dump(chk_enc("\xf8\x87\xbf\xbf\xbf", 5));
@@ -57,7 +57,7 @@ echo "UTF-8 and surrogates area\n";
 $out = '';
 $cnt = 0;
 for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
-    $s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), 2);
+    $s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), 3);
     if ($s === false) {
         $cnt++;
     } else {
diff --git a/ext/mbstring/tests/utf8_error_handling.phpt b/ext/mbstring/tests/utf8_error_handling.phpt
@@ -0,0 +1,56 @@
+--TEST--
+Confirm error handling for UTF-8 complies with WHATWG spec
+--EXTENSIONS--
+mbstring
+--FILE--
+<?php
+/* The WHATWG specifies not just how web browsers should handle _valid_
+ * UTF-8 text, but how they should handle _invalid_ UTF-8 text (such
+ * as how many error markers each invalid byte sequence should decode
+ * to).
+ * That specification is followed by the JavaScript Encoding API.
+ *
+ * The API documentation for mb_convert_encoding does not specify how
+ * many error markers we will emit for each possible invalid byte
+ * sequence, so we might as well comply with the WHATWG specification.
+ *
+ * Thanks to Martin Auswöger for pointing this out... and another big
+ * thanks for providing test cases!
+ *
+ * Ref: https://encoding.spec.whatwg.org/#utf-8-decoder
+ */
+mb_substitute_character(0x25);
+
+$testCases = [
+  ["\x80", "%"],
+  ["\xFF", "%"],
+  ["\xC2\x7F", "%\x7F"],
+  ["\xC2\x80", "\xC2\x80"],
+  ["\xDF\xBF", "\xDF\xBF"],
+  ["\xDF\xC0", "%%"],
+  ["\xE0\xA0\x7F", "%\x7F"],
+  ["\xE0\xA0\x80", "\xE0\xA0\x80"],
+  ["\xEF\xBF\xBF", "\xEF\xBF\xBF"],
+  ["\xEF\xBF\xC0", "%%"],
+  ["\xF0\x90\x80\x7F", "%\x7F"],
+  ["\xF0\x90\x80\x80", "\xF0\x90\x80\x80"],
+  ["\xF4\x8F\xBF\xBF", "\xF4\x8F\xBF\xBF"],
+  ["\xF4\x8F\xBF\xC0", "%%"],
+  ["\xFA\x80\x80\x80\x80", "%%%%%"],
+  ["\xFB\xBF\xBF\xBF\xBF", "%%%%%"],
+  ["\xFD\x80\x80\x80\x80\x80", "%%%%%%"],
+  ["\xFD\xBF\xBF\xBF\xBF\xBF", "%%%%%%"]
+];
+
+foreach ($testCases as $testCase) {
+  $result = mb_convert_encoding($testCase[0], 'UTF-8', 'UTF-8');
+  if ($result !== $testCase[1]) {
+    die("Expected UTF-8 string " . bin2hex($testCase[0]) . " to convert to UTF-8 string " . bin2hex($testCase[1]) . "; got " . bin2hex($result));
+  }
+}
+
+echo "All done!\n";
+
+?>
+--EXPECT--
+All done!
diff --git a/ext/mbstring/tests/utf_encodings.phpt b/ext/mbstring/tests/utf_encodings.phpt
@@ -761,14 +761,14 @@ testValidString('', '', 'UTF-8', 'UTF-32BE');
 
 $invalid = array(
   // Codepoints outside of valid 0-0x10FFFF range for Unicode
-  "\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 3), // CP 0x110000
+  "\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x110000
   "\xF7\x80\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x1C0000
   "\xF7\xBF\xBF\xBF" => str_repeat("\x00\x00\x00%", 4), // CP 0x1FFFFF
 
   // Reserved range for UTF-16 surrogate pairs
-  "\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 2),     // CP 0xD800
-  "\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 2),     // CP 0xDBFF
-  "\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 2),     // CP 0xDFFF
+  "\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 3),     // CP 0xD800
+  "\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 3),     // CP 0xDBFF
+  "\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 3),     // CP 0xDFFF
 
   // Truncated characters
   "\xDF" => "\x00\x00\x00%",         // should have been 2-byte
@@ -788,8 +788,8 @@ $invalid = array(
 
   // Multi-byte characters which end too soon and go to a junk byte
   // (Which isn't even valid to start a new character)
-  "\xF0\xBF\xBF\xFF" => "\x00\x00\x00%",
-  "\xF0\xBF\xFF" => "\x00\x00\x00%",
+  "\xF0\xBF\xBF\xFF" => str_repeat("\x00\x00\x00%", 2),
+  "\xF0\xBF\xFF" => str_repeat("\x00\x00\x00%", 2),
 
   // Continuation bytes which appear outside of a MB char
   "\x80" => "\x00\x00\x00%",
@@ -799,8 +799,8 @@ $invalid = array(
   // Overlong code units
   // (Using more bytes than needed to encode a character)
   "\xC1\xBF" => str_repeat("\x00\x00\x00%", 2),        // didn't need 2 bytes
-  "\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 2),    // didn't need 3 bytes
-  "\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 3) // didn't need 4 bytes
+  "\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 3),    // didn't need 3 bytes
+  "\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 4) // didn't need 4 bytes
 );
 
 testInvalidCodepoints($invalid, 'UTF-8');