Skip to content

Commit e0a5736

Browse files
committed
grapheme_extract should pass over invalid surrogate halves
Many systems incorrectly encode surrogate halves from a UTF-16 stream into UTF-8 as two three-byte characters instead of the proper four-byte sequence. These are invalid characters in UTF-8 and should be skipped when decoding with `grapheme_extract`, but it’s not currently handling these properly.

> If offset does not point to the first byte of a UTF-8 character,
> the start position is moved to the next character boundary.

For example, U+1F170 (d83c dd70) should encode as F0 9F 85 B0, but when applying the UTF-8 encoder invalidly to d83c, the output would be ED A0 BC. This entire span of bytes is invalid UTF-8. The same holds for every other encoded surrogate half, such as ED A0 BD (d83d), which is used in the example below.

```php
grapheme_extract( "\xED\xA0\xBDa", 1, GRAPHEME_EXTR_COUNT, 0, $next );
// returns "\xED", an invalid UTF-8 byte sequence
// $next === 1, pointing into the middle of the invalid sequence
```

Instead, it should return “a” and point `$next` to the end of the string.
1 parent 10ccb6b commit e0a5736

File tree

1 file changed

+5
-1
lines changed

1 file changed

+5
-1
lines changed

ext/intl/tests/grapheme2.phpt

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -590,7 +590,8 @@ function ut_main()
590590
array( $char_a_ring_nfd . "bcde" . $char_a_ring_nfd . "f", 4, 5, 11, "de" . $char_a_ring_nfd . "f" ),
591591
array( $char_a_ring_nfd . "bcde" . $char_a_ring_nfd . "f", 4, -6, 11, "de" . $char_a_ring_nfd . "f" ),
592592

593-
array( "\x95\x00a\x85b", 1, 0, 2, "\x00" ),
593+
array( "\x95\x00a\x85b", 1, 0, 2, "\x00" ), // Invalid UTF-8 leading byte.
594+
array( "\xED\xA0\xBDa", 1, 0, 3, "a" ), // Incorrectly-encoded surrogate half.
594595

595596
array( $char_a_ring_nfd . $char_o_diaeresis_nfd . $char_o_diaeresis_nfd, 3, $char_a_ring_nfd . $char_o_diaeresis_nfd . $char_o_diaeresis_nfd ),
596597
array( $char_a_ring_nfd . $char_o_diaeresis_nfd . $char_o_diaeresis_nfd, 2, $char_a_ring_nfd . $char_o_diaeresis_nfd ),
@@ -744,6 +745,7 @@ function ut_main()
744745
array( $char_o_diaeresis_nfd . "abc" . $char_a_ring_nfd . "xyz", 8, 5, "c" . $char_a_ring_nfd . "xyz" ),
745746
array( $char_o_diaeresis_nfd . "abc" . $char_a_ring_nfd . "xyz", 8, 6, $char_a_ring_nfd . "xyz" ),
746747

748+
array( "\xED\xA0\xBDa", 1, 0, 3, "a" ), // Incorrectly-encoded surrogate half.
747749
);
748750

749751
foreach( $tests as $test ) {
@@ -1137,6 +1139,7 @@ extract from "a%CC%8Abcde" "2" graphemes - grapheme_extract starting at byte pos
11371139
extract from "a%CC%8Abcdea%CC%8Af" "4" graphemes - grapheme_extract starting at byte position 5 with $next = dea%CC%8Af == dea%CC%8Af $next=11 == 11
11381140
extract from "a%CC%8Abcdea%CC%8Af" "4" graphemes - grapheme_extract starting at byte position -6 with $next = dea%CC%8Af == dea%CC%8Af $next=11 == 11
11391141
extract from "%95%00a%85b" "1" graphemes - grapheme_extract starting at byte position 0 with $next = %00 == %00 $next=2 == 2
1142+
extract from "%ED%A0%BDa" "1" graphemes - grapheme_extract starting at byte position 0 with $next = a == a $next=3 == 3
11401143
extract from "a%CC%8Ao%CC%88o%CC%88" "3" graphemes - grapheme_extract = a%CC%8Ao%CC%88o%CC%88 == a%CC%8Ao%CC%88o%CC%88
11411144
extract from "a%CC%8Ao%CC%88o%CC%88" "2" graphemes - grapheme_extract = a%CC%8Ao%CC%88 == a%CC%8Ao%CC%88
11421145
extract from "a%CC%8Ao%CC%88c" "1" graphemes - grapheme_extract = a%CC%8A == a%CC%8A
@@ -1221,3 +1224,4 @@ extract from "o%CC%88abca%CC%8Axyz" "8" graphemes - grapheme_extract GRAPHEME_EX
12211224
extract from "o%CC%88abca%CC%8Axyz" "8" graphemes - grapheme_extract GRAPHEME_EXTR_MAXCHARS starting at byte position 4 = bca%CC%8Axyz == bca%CC%8Axyz
12221225
extract from "o%CC%88abca%CC%8Axyz" "8" graphemes - grapheme_extract GRAPHEME_EXTR_MAXCHARS starting at byte position 5 = ca%CC%8Axyz == ca%CC%8Axyz
12231226
extract from "o%CC%88abca%CC%8Axyz" "8" graphemes - grapheme_extract GRAPHEME_EXTR_MAXCHARS starting at byte position 6 = a%CC%8Axyz == a%CC%8Axyz
1227+
extract from "%ED%A0%BDa" "1" graphemes - grapheme_extract GRAPHEME_EXTR_MAXCHARS starting at byte position 0 = a == a

0 commit comments

Comments
 (0)