Skip to content

Commit f5f3ee7

Browse files
committed
Add test suite for mUTF-7 (IMAP) encoding
1 parent 78dc160 commit f5f3ee7

File tree

1 file changed

+206
-0
lines changed

1 file changed

+206
-0
lines changed
Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
--TEST--
Exhaustive test of mUTF-7 (IMAP) encoding verification and conversion
--SKIPIF--
<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
--FILE--
<?php
/* testValidString(), testInvalidString() and identifyInvalidString() are not
 * defined in this file; presumably they are provided by this include */
include('encoding_tests.inc');
/* Use '%' as the substitute character; the testInvalid() calls below expect
 * "%" as the conversion result for erroneous input */
mb_substitute_character(0x25); // '%'

/**
 * Re-encode UTF-8 text as UTF-16BE.
 *
 * The tests below feed the result to mBase64() to build the Base64
 * sections of mUTF-7 (IMAP) strings.
 *
 * @param string $utf8 text encoded as UTF-8
 * @return string the same text encoded as UTF-16BE
 */
function utf16BE($utf8) {
    return mb_convert_encoding($utf8, 'UTF-16BE', 'UTF-8');
}

/**
 * Encode raw bytes in the modified Base64 alphabet used by these tests:
 * ',' takes the place of '/', and the trailing '=' padding is dropped.
 *
 * @param string $str raw bytes (here: UTF-16BE text)
 * @return string modified Base64 text, without the surrounding '&' and '-'
 */
function mBase64($str) {
    /* base64_encode() only ever emits '=' as trailing padding, so trimming it
     * from the right is the same as removing every occurrence */
    return rtrim(strtr(base64_encode($str), '/', ','), '=');
}

/**
 * Check a valid mUTF-7 (IMAP) string against its UTF-8 equivalent.
 *
 * Thin wrapper which fixes the encoding pair to 'UTF7-IMAP' / 'UTF-8' and
 * delegates to testValidString() (not defined in this file).
 *
 * @param string $from     text in UTF7-IMAP
 * @param string $to       expected UTF-8 text
 * @param bool   $bothWays passed straight through to testValidString();
 *                         presumably it also enables the reverse
 *                         (UTF-8 to UTF7-IMAP) check -- confirm against the helper
 */
function testValid($from, $to, $bothWays = true) {
    testValidString($from, $to, 'UTF7-IMAP', 'UTF-8', $bothWays);
}
/**
 * Check an invalid mUTF-7 (IMAP) string against its expected UTF-8 conversion.
 *
 * Thin wrapper which fixes the encoding pair to 'UTF7-IMAP' / 'UTF-8' and
 * delegates to testInvalidString() (not defined in this file).
 *
 * @param string $from erroneous text in UTF7-IMAP
 * @param string $to   expected UTF-8 result (the callers below pass "%",
 *                     the substitute character configured at the top of the file)
 */
function testInvalid($from, $to) {
    testInvalidString($from, $to, 'UTF7-IMAP', 'UTF-8');
}

/* An empty string is valid */
testValid("", "");
echo "Identification passes on empty string... good start!\n";

/* Identification and conversion of ASCII characters (minus &) */
for ($i = 0x20; $i <= 0x7E; $i++) {
    /* '&' is skipped here; it has its own tests further down ('&-' and
     * Base64-encoded '&') */
    if ($i === 0x26) { // '&'
        continue;
    }
    testValid(chr($i), chr($i));
}
echo "Testing all valid single-character ASCII strings... check!\n";

/* Identification and conversion of non-ASCII characters
 * (here: every byte below 0x20, plus 0x7F and everything above it) */
for ($i = 0; $i < 0x20; $i++) {
    testInvalid(chr($i), "%");
}
for ($i = 0x7F; $i < 256; $i++) {
    testInvalid(chr($i), "%");
}
echo "Non-ASCII characters convert to illegal char marker... yes!\n";

/* Identification of '&' when Base-64 encoded
 * (checked with $bothWays = false, unlike the '&-' tests further down) */
testValid("&" . mBase64(utf16BE("&")) . "-", "&", false);
echo "& can be Base64-encoded... yes!\n";

/* Identification of unterminated & section
 * (each string opens a section with '&' but never closes it with '-') */
identifyInvalidString("&", 'UTF7-IMAP');
identifyInvalidString("abc&", 'UTF7-IMAP');
identifyInvalidString("&" . mBase64(utf16BE("ハムサンドイッチ")), 'UTF7-IMAP');
echo "Testing unterminated & sections... yep!\n";

/* Identification of null shifts (& immediately after -)
 *
 * This is illegal according to the spec for mUTF-7 (IMAP), but currently we are letting
 * it pass... among other things, this makes it possible to concatenate UTF-7-IMAP
 * strings naively without the concatenated strings being treated as 'invalid'
 *
 * If ever we want to enforce this part of the spec, uncomment the following test */
/*
identifyInvalidString("&" . mBase64(utf16BE("肉包子")) . "-&" . mBase64(utf16BE("冰淇淋")) . "-", 'UTF7-IMAP');
echo "Testing consecutive & sections which should have been merged... yep!\n";
*/

/* Conversion of Base64-encoded ASCII characters (excluding &)
 * These should be treated as erroneous and mb_substitute_character should apply */
for ($i = 0x20; $i <= 0x7E; $i++) {
    /* '&' is exempt: the Base64 form of '&' is tested as valid elsewhere in this file */
    if ($i === 0x26) { // '&'
        continue;
    }
    testInvalid("&" . mBase64(utf16BE(chr($i))) . "-", "%");
}
echo "Testing ASCII characters which are Base64-encoded... great!\n";

/* Conversion of & encoded as &- */
testValid("&-", "&");
testValid("abc&-", "abc&");
testValid("&-.&-", "&.&");
echo "Testing valid strings which use '&-' for '&'... good!\n";

/* Identification of & sections containing non-Base64 */

/* We'll use 6 character strings as a test, since 6 UTF-16 characters is just enough
 * to fit perfectly in Base64 encoding, with no padding */
$testString = mBase64(utf16BE("我是打酱油的"));
if (strlen($testString) !== 16) {
    die("Erk!!");
}
for ($i = 0; $i < 256; $i++) {
    /* Skip every byte which is legal inside a Base64 section */
    if ($i >= 0x30 && $i <= 0x39) { // '0'..'9'
        continue;
    }
    if ($i >= 0x41 && $i <= 0x5A) { // 'A'..'Z'
        continue;
    }
    if ($i >= 0x61 && $i <= 0x7A) { // 'a'..'z'
        continue;
    }
    if ($i === 0x2B || $i === 0x2C) { // '+' or ','
        continue;
    }
    if ($i === 0x2D) { // '-'... this will be interpreted as ending the Base64 section
        continue;
    }
    /* NOTE(review): 11 Base64 characters plus the probe byte make a 12-character
     * section, which would decode to 9 bytes -- an odd count. The section may
     * therefore be invalid regardless of the probe byte; confirm whether a full
     * 16-character section with one byte replaced was intended. */
    identifyInvalidString("&" . substr($testString, 0, 11) . chr($i) . "-", 'UTF7-IMAP');
}
echo "Identification fails when Base64 sections contain non-Base64 bytes... right!\n";

/* Tell me, please, how many ways can UTF-16BE text get messed up?
 * Why, that's elementary... */

/* 1. The second half of a surrogate pair could come first, */
$testString = mb_convert_encoding("\x00\x01\x04\x00", 'UTF-16BE', 'UTF-32BE');
if (strlen($testString) !== 4) {
    die("Ouch!");
}
/* Swap the two 16-bit halves, so the second half now precedes the first */
$testString = substr($testString, 2, 2) . substr($testString, 0, 2);
identifyInvalidString("&" . mBase64($testString) . "-", 'UTF7-IMAP');

/* ...and we should detect this wherever it occurs */
/* The literal originally used for $singleChar was empty in this copy of the
 * file, which made the probes below identical to the one above. Any character
 * occupying a single UTF-16 code unit serves the purpose; a non-ASCII one is
 * used so that the prefix itself is valid inside a Base64 section. */
$singleChar = utf16BE("☺");
if (strlen($singleChar) !== 2) {
    die("That was supposed to be a single UTF-16 code unit");
}
$doubleChar = mb_convert_encoding("\x00\x01\x04\x01", 'UTF-16BE', 'UTF-32BE');
if (strlen($doubleChar) !== 4) {
    die("That was supposed to be a surrogate pair");
}
identifyInvalidString("&" . mBase64($singleChar . $testString) . "-", 'UTF7-IMAP');
identifyInvalidString("&" . mBase64($singleChar . $singleChar . $testString) . "-", 'UTF7-IMAP');
identifyInvalidString("&" . mBase64($singleChar . $singleChar . $singleChar . $testString) . "-", 'UTF7-IMAP');
identifyInvalidString("&" . mBase64($doubleChar . $testString) . "-", 'UTF7-IMAP');
identifyInvalidString("&" . mBase64($singleChar . $doubleChar . $testString) . "-", 'UTF7-IMAP');
identifyInvalidString("&" . mBase64($singleChar . $singleChar . $doubleChar . $testString) . "-", 'UTF7-IMAP');

/* 2. The first half of a surrogate pair might be followed by an invalid 2nd part, */
$testString = mb_convert_encoding("\x00\x01\x04\x00", 'UTF-16BE', 'UTF-32BE');
/* Keep only the first 16-bit half and follow it with a plain 'a' */
$testString = substr($testString, 0, 2) . mb_convert_encoding("a", 'UTF-16BE', 'ASCII');
identifyInvalidString("&" . mBase64($testString) . "-", 'UTF7-IMAP');

/* ...and we should also detect that wherever it occurs... */
identifyInvalidString("&" . mBase64($singleChar . $testString) . "-", 'UTF7-IMAP');
identifyInvalidString("&" . mBase64($singleChar . $singleChar . $testString) . "-", 'UTF7-IMAP');
identifyInvalidString("&" . mBase64($doubleChar . $testString) . "-", 'UTF7-IMAP');

/* 3. The first half of a surrogate pair could come at the end of the string, */
$testString = mb_convert_encoding("\x00\x01\x04\x00", 'UTF-16BE', 'UTF-32BE');
/* substr(..., 0, 2) keeps only the first 16-bit half of the pair */
identifyInvalidString("&" . mBase64(substr($testString, 0, 2)) . "-", 'UTF7-IMAP');
identifyInvalidString("&" . mBase64($singleChar . substr($testString, 0, 2)) . "-", 'UTF7-IMAP');
identifyInvalidString("&" . mBase64($singleChar . $singleChar . substr($testString, 0, 2)) . "-", 'UTF7-IMAP');

/* 4. Or, it could have an odd number of bytes in it! */
$testString = utf16BE("ドーナツ");
/* Chop off the final byte, leaving the last 16-bit unit incomplete */
$testString = substr($testString, 0, strlen($testString) - 1);
identifyInvalidString("&" . mBase64($testString) . "-", 'UTF7-IMAP');

/* And there is one bonus way to discombobulate your UTF-16BE when it is Base64-encoded...
 * The Base64 might not decode to an integral number of bytes
 * Or, equivalently... it might not be padded with zeroes (as the RFC requires) */
$testString = utf16BE("☺⛑");
if (strlen($testString) !== 4) {
    die("No good");
}
$encoded = mBase64($testString);
if (strlen($encoded) !== 6) {
    die("Don't like that");
}
/* Mess up the padding by replacing the last Base64 character with ',',
 * which represents 63 (a number with a 1 in the last bit) */
identifyInvalidString("&" . substr($encoded, 0, strlen($encoded) - 1) . ",-", 'UTF7-IMAP');

echo "Identification fails when UTF-16 text is invalid... no sweat!\n";

/* OK, let's try valid Base64-encoded text now */

/* The numeric labels below give the UTF-16 size in bytes of each character in
 * the section: 2 for a character in one 16-bit unit, 4 for a surrogate pair.
 * Every 2-byte character used is non-ASCII, since Base64-encoded ASCII is
 * tested as erroneous earlier in this file. */

/* 2-byte char */
testValid("&" . mBase64(utf16BE("☺")) . "-", "☺");
/* 2 + 2 */
testValid("&" . mBase64(utf16BE("饺子")) . "-", "饺子");
/* 2 + 2 + 2 */
testValid("&" . mBase64(utf16BE("１２３")) . "-", "１２３");
/* 2 + 2 + 2 + 2 */
testValid("&" . mBase64(utf16BE("ᄚᄆᄇᄈ")) . "-", "ᄚᄆᄇᄈ");
/* 4 */
$longChar1 = mb_convert_encoding("\x00\x01\x04\x01", 'UTF-16BE', 'UTF-32BE'); // U+10401 as UTF-16BE
$longChar2 = mb_convert_encoding("\x00\x01\x04\x01", 'UTF-8', 'UTF-32BE');    // U+10401 as UTF-8
testValid("&" . mBase64($longChar1) . "-", $longChar2);
/* 2 + 4 */
testValid("&" . mBase64(utf16BE("☺") . $longChar1) . "-", "☺" . $longChar2);
/* 4 + 2 */
testValid("&" . mBase64($longChar1 . utf16BE("☺")) . "-", $longChar2 . "☺");
/* 2 + 4 + 2 */
testValid("&" . mBase64(utf16BE("☺") . $longChar1 . utf16BE("⛑")) . "-", "☺" . $longChar2 . "⛑");
/* 2 + 2 + 4 */
testValid("&" . mBase64(utf16BE("西瓜") . $longChar1) . "-", "西瓜" . $longChar2);
/* 2 + 2 + 4 + 2 */
testValid("&" . mBase64(utf16BE("西瓜") . $longChar1 . utf16BE("☺")) . "-", "西瓜" . $longChar2 . "☺");
/* 2 + 2 + 4 + 4 */
testValid("&" . mBase64(utf16BE("西瓜") . $longChar1 . $longChar1) . "-", "西瓜" . $longChar2 . $longChar2);
/* 2 + 2 + 2 + 4 */
testValid("&" . mBase64(utf16BE("西红柿") . $longChar1) . "-", "西红柿" . $longChar2);

/* Multiple sections of valid ASCII _and_ Base64-encoded text */
/* The Base64 sections carry non-ASCII text only (fullwidth digits and a smiley);
 * the plain "123", "abc" and "." stay outside them */
testValid("123&" . mBase64(utf16BE("１２３")) . "-abc&" . mBase64(utf16BE("☺")) . "-.", "123１２３abc☺.");

echo "Identification and conversion of valid text is working... perfect!\n";

?>
--EXPECT--
Identification passes on empty string... good start!
Testing all valid single-character ASCII strings... check!
Non-ASCII characters convert to illegal char marker... yes!
& can be Base64-encoded... yes!
Testing unterminated & sections... yep!
Testing ASCII characters which are Base64-encoded... great!
Testing valid strings which use '&-' for '&'... good!
Identification fails when Base64 sections contain non-Base64 bytes... right!
Identification fails when UTF-16 text is invalid... no sweat!
Identification and conversion of valid text is working... perfect!

0 commit comments

Comments
 (0)