|
| 1 | +--TEST-- |
| 2 | +Exhaustive test of mUTF-7 (IMAP) encoding verification and conversion |
| 3 | +--SKIPIF-- |
| 4 | +<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?> |
| 5 | +--FILE-- |
| 6 | +<?php |
| 7 | +include('encoding_tests.inc'); |
| 8 | +mb_substitute_character(0x25); // '%' |
| 9 | + |
/**
 * Re-encode a UTF-8 string as UTF-16BE.
 *
 * UTF-16BE is the byte form that gets Base64-encoded inside the '&...-'
 * sections built throughout this test.
 *
 * @param string $utf8 Text encoded as UTF-8
 * @return string The same text encoded as UTF-16BE
 */
function utf16BE($utf8) {
  $utf16 = mb_convert_encoding($utf8, 'UTF-16BE', 'UTF-8');
  return $utf16;
}
| 13 | + |
/**
 * Encode bytes with the modified Base64 alphabet used by this test:
 * standard Base64 with '/' swapped for ',' and the '=' padding dropped.
 *
 * @param string $str Raw bytes to encode
 * @return string Modified Base64 text without padding
 */
function mBase64($str) {
  $standard = base64_encode($str);
  /* '/' becomes ','; the only '=' characters Base64 emits are trailing padding */
  return rtrim(strtr($standard, '/', ','), '=');
}
| 17 | + |
/**
 * Check a valid UTF7-IMAP string against its UTF-8 counterpart.
 *
 * Thin wrapper which fixes the encoding pair and forwards everything to
 * testValidString(), which is not defined in this file section.
 *
 * @param string $from     Input in UTF7-IMAP
 * @param string $to       Expected UTF-8 text
 * @param bool   $bothWays Forwarded unchanged to testValidString()
 */
function testValid($from, $to, $bothWays = true) {
  $sourceEncoding = 'UTF7-IMAP';
  $targetEncoding = 'UTF-8';
  testValidString($from, $to, $sourceEncoding, $targetEncoding, $bothWays);
}
/**
 * Check an invalid UTF7-IMAP string against its expected UTF-8 conversion.
 *
 * Thin wrapper which fixes the encoding pair and forwards everything to
 * testInvalidString(), which is not defined in this file section.
 *
 * @param string $from Input which is not valid UTF7-IMAP
 * @param string $to   Expected UTF-8 conversion result
 */
function testInvalid($from, $to) {
  $sourceEncoding = 'UTF7-IMAP';
  $targetEncoding = 'UTF-8';
  testInvalidString($from, $to, $sourceEncoding, $targetEncoding);
}
| 24 | + |
/* The empty string is the simplest valid input */
testValid("", "");
echo "Identification passes on empty string... good start!\n";

/* Every printable ASCII byte except '&' stands for itself */
foreach (range(0x20, 0x7E) as $codepoint) {
  if ($codepoint !== 0x26) { // '&' is exercised separately further down
    $char = chr($codepoint);
    testValid($char, $char);
  }
}
echo "Testing all valid single-character ASCII strings... check!\n";
| 36 | + |
/* Single bytes outside 0x20..0x7E (control characters, DEL and everything
 * from 0x80 up) are checked as invalid input whose expected conversion
 * result is "%" */
$illegalBytes = array_merge(range(0x00, 0x1F), range(0x7F, 0xFF));
foreach ($illegalBytes as $byte) {
  testInvalid(chr($byte), "%");
}
echo "Non-ASCII characters convert to illegal char marker... yes!\n";
| 43 | + |
/* '&' written as a Base64 section instead of the usual '&-'.
 * $bothWays is false here; presumably that limits the check to the
 * UTF7-IMAP -> UTF-8 direction (confirm against testValidString) */
$encodedAmpersand = "&" . mBase64(utf16BE("&")) . "-";
testValid($encodedAmpersand, "&", false);
echo "& can be Base64-encoded... yes!\n";
| 47 | + |
/* A '&' section which is never closed by '-' must not be identified as valid */
$unterminated = [
  "&",
  "abc&",
  "&" . mBase64(utf16BE("ハムサンドイッチ")),
];
foreach ($unterminated as $input) {
  identifyInvalidString($input, 'UTF7-IMAP');
}
echo "Testing unterminated & sections... yep!\n";
| 53 | + |
| 54 | +/* Identification of null shifts (& immediately after -) |
| 55 | + * |
| 56 | + * This is illegal according to the spec for mUTF-7 (IMAP), but currently we are letting |
| 57 | + * it pass... among other things, this makes it possible to concatenate UTF-7-IMAP |
| 58 | + * strings naively without the concatenated strings being treated as 'invalid' |
| 59 | + * |
| 60 | + * If ever we want to enforce this part of the spec, uncomment the following test */ |
| 61 | +/* |
| 62 | +identifyInvalidString("&" . mBase64(utf16BE("肉包子")) . "-&" . mBase64(utf16BE("冰淇淋")) . "-", 'UTF7-IMAP'); |
| 63 | +echo "Testing consecutive & sections which should have been merged... yep!\n"; |
| 64 | +*/ |
| 65 | + |
/* Printable ASCII (other than '&') wrapped in a Base64 section is tested as
 * erroneous input; the expected conversion result is "%" */
foreach (range(0x20, 0x7E) as $codepoint) {
  if ($codepoint === 0x26) // '&'
    continue;
  $encoded = "&" . mBase64(utf16BE(chr($codepoint))) . "-";
  testInvalid($encoded, "%");
}
echo "Testing ASCII characters which are Base64-encoded... great!\n";
| 74 | + |
/* '&-' is the escape for a literal '&' */
$ampersandCases = [
  "&-"    => "&",
  "abc&-" => "abc&",
  "&-.&-" => "&.&",
];
foreach ($ampersandCases as $encoded => $decoded) {
  testValid($encoded, $decoded);
}
echo "Testing valid strings which use '&-' for '&'... good!\n";
| 80 | + |
/* Identification of & sections containing non-Base64 */

/* Six BMP characters are 12 bytes of UTF-16BE, which encode to exactly
 * 16 Base64 characters with no padding */
$testString = mBase64(utf16BE("我是打酱油的"));
if (strlen($testString) !== 16)
  die("Erk!!");

/* The 64 characters which are legal inside a '&...-' section */
$base64Alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,';

for ($i = 0; $i < 256; $i++) {
  $byte = chr($i);
  /* Skip legal Base64 characters, and '-' which would simply end the section */
  if ($byte === '-' || strpos($base64Alphabet, $byte) !== false)
    continue;

  /* Stray byte right before the closing '-' */
  identifyInvalidString("&" . substr($testString, 0, 11) . $byte . "-", 'UTF7-IMAP');

  /* Stray byte in the middle of an otherwise complete 16-character section.
   * The probe above keeps only 12 Base64 characters, which decode to 9 bytes;
   * that odd length is already invalid as UTF-16. Keeping the full length
   * makes the stray byte the only reason for rejection. */
  identifyInvalidString("&" . substr($testString, 0, 11) . $byte . substr($testString, 12) . "-", 'UTF7-IMAP');
}
echo "Identification fails when Base64 sections contain non-Base64 bytes... right!\n";
| 102 | + |
/* A Base64 section has to decode to well-formed UTF-16BE.
 * Each case below breaks that in a different way. */

/* U+10400 needs a surrogate pair in UTF-16; split it into its two halves */
$surrogatePair = mb_convert_encoding("\x00\x01\x04\x00", 'UTF-16BE', 'UTF-32BE');
if (strlen($surrogatePair) !== 4)
  die("Ouch!");
$highHalf = substr($surrogatePair, 0, 2);
$lowHalf  = substr($surrogatePair, 2, 2);

/* 1. Second half of a surrogate pair ahead of the first half */
$reversedPair = $lowHalf . $highHalf;
identifyInvalidString("&" . mBase64($reversedPair) . "-", 'UTF7-IMAP');

/* ...also when preceded by various amounts of valid text */
$singleChar = mb_convert_encoding("1", 'UTF-16BE', 'ASCII');
$doubleChar = mb_convert_encoding("\x00\x01\x04\x01", 'UTF-16BE', 'UTF-32BE');
if (strlen($doubleChar) !== 4)
  die("That was supposed to be a surrogate pair");
$prefixes = [
  $singleChar,
  $singleChar . $singleChar,
  $singleChar . $singleChar . $singleChar,
  $doubleChar,
  $singleChar . $doubleChar,
  $singleChar . $singleChar . $doubleChar,
];
foreach ($prefixes as $prefix) {
  identifyInvalidString("&" . mBase64($prefix . $reversedPair) . "-", 'UTF7-IMAP');
}

/* 2. First half of a surrogate pair followed by something which is not a
 *    second half, alone and after valid text */
$brokenPair = $highHalf . mb_convert_encoding("a", 'UTF-16BE', 'ASCII');
foreach (['', $singleChar, $singleChar . $singleChar, $doubleChar] as $prefix) {
  identifyInvalidString("&" . mBase64($prefix . $brokenPair) . "-", 'UTF7-IMAP');
}

/* 3. First half of a surrogate pair as the very last thing in the section */
foreach (['', $singleChar, $singleChar . $singleChar] as $prefix) {
  identifyInvalidString("&" . mBase64($prefix . $highHalf) . "-", 'UTF7-IMAP');
}

/* 4. An odd number of bytes: drop the final byte of valid UTF-16BE text */
$oddLength = substr(utf16BE("ドーナツ"), 0, -1);
identifyInvalidString("&" . mBase64($oddLength) . "-", 'UTF7-IMAP');

/* Bonus case: the Base64 itself leaves non-zero bits in the padding.
 * Two BMP characters are 4 bytes, which encode to 6 Base64 characters. */
$twoChars = utf16BE("☺⛑");
if (strlen($twoChars) !== 4)
  die("No good");
$encoded = mBase64($twoChars);
if (strlen($encoded) !== 6)
  die("Don't like that");
/* ',' stands for 63, whose lowest bit is 1, so swapping it in for the last
 * Base64 character breaks the padding */
identifyInvalidString("&" . substr($encoded, 0, -1) . ",-", 'UTF7-IMAP');

echo "Identification fails when UTF-16 text is invalid... no sweat!\n";
| 160 | + |
/* OK, let's try valid Base64-encoded text now.
 * The "2" and "4" in the comments below are the UTF-16BE byte lengths of
 * the characters in each Base64 section. */

/* Fullwidth digits U+FF11..U+FF13. These must NOT be the ASCII digits
 * "123": Base64-encoded ASCII is tested as invalid earlier in this file, and
 * ASCII text would not round-trip to a Base64 section either. Written as
 * escapes so the characters cannot be silently normalized to ASCII. */
$fullwidthDigits = "\u{FF11}\u{FF12}\u{FF13}";

/* 2-byte char */
testValid("&" . mBase64(utf16BE("☺")) . "-", "☺");
/* 2 + 2 */
testValid("&" . mBase64(utf16BE("饺子")) . "-", "饺子");
/* 2 + 2 + 2 */
testValid("&" . mBase64(utf16BE($fullwidthDigits)) . "-", $fullwidthDigits);
/* 2 + 2 + 2 + 2 */
testValid("&" . mBase64(utf16BE("ᄚᄆᄇᄈ")) . "-", "ᄚᄆᄇᄈ");
/* 4 */
$longChar1 = mb_convert_encoding("\x00\x01\x04\x01", 'UTF-16BE', 'UTF-32BE');
$longChar2 = mb_convert_encoding("\x00\x01\x04\x01", 'UTF-8', 'UTF-32BE');
testValid("&" . mBase64($longChar1) . "-", $longChar2);
/* 2 + 4 */
testValid("&" . mBase64(utf16BE("饼") . $longChar1) . "-", "饼" . $longChar2);
/* 4 + 2 */
testValid("&" . mBase64($longChar1 . utf16BE("饼")) . "-", $longChar2 . "饼");
/* 2 + 4 + 2 */
testValid("&" . mBase64(utf16BE("☺") . $longChar1 . utf16BE("饼")) . "-", "☺" . $longChar2 . "饼");
/* 2 + 2 + 4 */
testValid("&" . mBase64(utf16BE("西瓜") . $longChar1) . "-", "西瓜" . $longChar2);
/* 2 + 2 + 4 + 2 */
testValid("&" . mBase64(utf16BE("西瓜") . $longChar1 . utf16BE("☺")) . "-", "西瓜" . $longChar2 . "☺");
/* 2 + 2 + 4 + 4 */
testValid("&" . mBase64(utf16BE("西瓜") . $longChar1 . $longChar1) . "-", "西瓜" . $longChar2 . $longChar2);
/* 2 + 2 + 2 + 4 */
testValid("&" . mBase64(utf16BE("西红柿") . $longChar1) . "-", "西红柿" . $longChar2);

/* Multiple sections of valid ASCII _and_ Base64-encoded text */
testValid("123&" . mBase64(utf16BE($fullwidthDigits)) . "-abc&" . mBase64(utf16BE("☺")) . "-.", "123" . $fullwidthDigits . "abc☺.");

echo "Identification and conversion of valid text is working... perfect!\n";
| 194 | + |
| 195 | +?> |
| 196 | +--EXPECT-- |
| 197 | +Identification passes on empty string... good start! |
| 198 | +Testing all valid single-character ASCII strings... check! |
| 199 | +Non-ASCII characters convert to illegal char marker... yes! |
| 200 | +& can be Base64-encoded... yes! |
| 201 | +Testing unterminated & sections... yep! |
| 202 | +Testing ASCII characters which are Base64-encoded... great! |
| 203 | +Testing valid strings which use '&-' for '&'... good! |
| 204 | +Identification fails when Base64 sections contain non-Base64 bytes... right! |
| 205 | +Identification fails when UTF-16 text is invalid... no sweat! |
| 206 | +Identification and conversion of valid text is working... perfect! |
0 commit comments