Fix buffer overflow bugs in UTF-7 text conversion

alexdowad · alexdowad · commit 8dddd3cfadce · 2022-05-28T21:53:38.000+02:00
After Nikita Popov found a buffer overrun bug in one of my pull requests, I was prompted to add more assertions in a38c7e5 to help me catch such bugs myself more easily in testing. Wouldn't you just know it: as soon as I added those assertions, the mbstring test suite caught another buffer overrun bug in my UTF-7 conversion code, which I wrote the better part of a year ago. Then, when I started fuzzing the code with libFuzzer, I found and fixed yet another buffer overflow. If we enter the main loop (which normally outputs 3 decoded Base64 characters) when the first half of a surrogate pair appeared at the end of the previous run, but the second half does not appear on this run, we need to output one error marker. Then, at the end of the main loop, if the Base64 input ends at an unexpected position AND the last character was not a legal Base64-encoded character, we need to output two more error markers. Those three error markers plus two valid, decoded bytes make five output values, which can push us over the available space in our wchar buffer when only four free slots are guaranteed; so the decoder now requires five free slots before processing a Base64 section.
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf7.c b/ext/mbstring/libmbfl/filters/mbfilter_utf7.c
@@ -478,7 +478,7 @@ static uint32_t* handle_base64_end(unsigned char n, unsigned char **p, uint32_t
 
 static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state)
 {
-	ZEND_ASSERT(bufsize >= 4); /* This function will infinite-loop if called with a tiny output buffer */
+	ZEND_ASSERT(bufsize >= 5); /* This function will infinite-loop if called with a tiny output buffer */
 
 	unsigned char *p = *in, *e = p + *in_len;
 	uint32_t *out = buf, *limit = buf + bufsize;
@@ -489,7 +489,7 @@ static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
 	while (p < e && out < limit) {
 		if (base64) {
 			/* Base64 section */
-			if ((limit - out) < 4) {
+			if ((limit - out) < 5) {
 				break;
 			}
 
@@ -631,16 +631,19 @@ static void mb_wchar_to_utf7(uint32_t *in, size_t len, mb_convert_buf *buf, bool
 				MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
 				RESTORE_CONVERSION_STATE();
 			} else {
-				/* Encode codepoint, preceded by any cached bits, as Base64 */
+				/* Encode codepoint, preceded by any cached bits, as Base64
+				 * Make enough space in the output buffer to hold both any bytes that
+				 * we emit right here, plus any finishing byte which might need to
+				 * be emitted if the input string ends abruptly */
 				uint64_t bits;
 				if (w >= MBFL_WCSPLANE_SUPMIN) {
 					/* Must use surrogate pair */
-					MB_CONVERT_BUF_ENSURE(buf, out, limit, 6);
+					MB_CONVERT_BUF_ENSURE(buf, out, limit, 7);
 					w -= 0x10000;
 					bits = ((uint64_t)cache << 32) | 0xD800DC00L | ((w & 0xFFC00) << 6) | (w & 0x3FF);
 					nbits += 32;
 				} else {
-					MB_CONVERT_BUF_ENSURE(buf, out, limit, 3);
+					MB_CONVERT_BUF_ENSURE(buf, out, limit, 4);
 					bits = (cache << 16) | w;
 					nbits += 16;
 				}