From 337c0a30afaf4aa35065cdd3fc55c6204b8d0468 Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Thu, 16 Mar 2023 22:14:18 +0100 Subject: [PATCH] Fix GH-10634: Lexing memory corruption We're not relying on re2c's bounds checking mechanism because re2c:yyfill:check = 0; is set. We just return 0 if we read over the end of the input in YYFILL. Note that we used to use the "any character" wildcard in the comment regexes. But that means if we go over the end in the comment regexes, we don't know that and it's just like the 0 bytes are part of the token. Since a 0 byte already is considered as an end-of-file, we can just block those in the regex. For the regexes with newlines, I had to not only include \x00 in the denylist, but also \n and \r because otherwise it would greedily match those and let the single-line comment run over multiple lines. --- Zend/tests/gh10634.phpt | 24 ++++++++++++++++++++++++ Zend/zend_language_scanner.l | 10 +++++++--- 2 files changed, 31 insertions(+), 3 deletions(-) create mode 100644 Zend/tests/gh10634.phpt diff --git a/Zend/tests/gh10634.phpt b/Zend/tests/gh10634.phpt new file mode 100644 index 0000000000000..41407bf307d7f --- /dev/null +++ b/Zend/tests/gh10634.phpt @@ -0,0 +1,24 @@ +--TEST-- +GH-10634 (Lexing memory corruption) +--FILE-- +<?php +function test_input($input) { + try { + eval($input); + } catch(Throwable $e) { + var_dump($e->getMessage()); + } +} + +test_input("y&/*"); +test_input("y&/**"); +test_input("y&#"); +test_input("y&# "); +test_input("y&//"); +?> +--EXPECT-- +string(36) "Unterminated comment starting line 1" +string(36) "Unterminated comment starting line 1" +string(36) "syntax error, unexpected end of file" +string(36) "syntax error, unexpected end of file" +string(36) "syntax error, unexpected end of file" diff --git a/Zend/zend_language_scanner.l b/Zend/zend_language_scanner.l index 7abd91b23a58a..054ed7bdc1ef6 100644 --- a/Zend/zend_language_scanner.l +++ b/Zend/zend_language_scanner.l @@ -1369,9 +1369,13 @@ TOKENS [;:,.|^&+-/*=%!~$<>?@] ANY_CHAR [^] NEWLINE 
("\r"|"\n"|"\r\n") OPTIONAL_WHITESPACE [ \n\r\t]* -MULTI_LINE_COMMENT "/*"([^*]*"*"+)([^*/][^*]*"*"+)*"/" -SINGLE_LINE_COMMENT "//".*[\n\r] -HASH_COMMENT "#"(([^[].*[\n\r])|[\n\r]) +/* We don't use re2c with bounds checking, we just return 0 bytes if we read past the input. + * If we use wildcard matching for comments, we can read past the input, which crashes + * once we try to report a syntax error because the 0 bytes are not actually part of + * the token. We prevent this by not allowing 0 bytes, which already aren't valid anyway. */ +MULTI_LINE_COMMENT "/*"([^*\x00]*"*"+)([^*/\x00][^*\x00]*"*"+)*"/" +SINGLE_LINE_COMMENT "//"[^\x00\n\r]*[\n\r] +HASH_COMMENT "#"(([^[\x00][^\x00\n\r]*[\n\r])|[\n\r]) WHITESPACE_OR_COMMENTS ({WHITESPACE}|{MULTI_LINE_COMMENT}|{SINGLE_LINE_COMMENT}|{HASH_COMMENT})+ OPTIONAL_WHITESPACE_OR_COMMENTS ({WHITESPACE}|{MULTI_LINE_COMMENT}|{SINGLE_LINE_COMMENT}|{HASH_COMMENT})*