From 70bd29649abe897619a7e494abd7c7344aaabe5c Mon Sep 17 00:00:00 2001 From: Ilija Tovilo Date: Wed, 29 Dec 2021 12:51:18 +0100 Subject: [PATCH 1/2] Optimize stripos/stristr Closes GH-7847 Closes GH-7852 Previously stripos/stristr would lowercase both the haystack and the needle to reuse strpos. The approach in this PR is similar to strpos. memchr is highly optimized so we're using it to search for the first character of the needle in the haystack. If we find it we compare the remaining characters of the needle manually. The new implementation seems to perform about half as well as strpos (as two memchr calls are necessary to find the next candidate). --- NEWS | 2 ++ Zend/zend_operators.h | 61 +++++++++++++++++++++++++++++++++++++++ ext/standard/php_string.h | 1 + ext/standard/string.c | 32 +++++++------------- main/php.h | 1 + 5 files changed, 75 insertions(+), 22 deletions(-) diff --git a/NEWS b/NEWS index 0a53434c36a9f..09da40c962e28 100644 --- a/NEWS +++ b/NEWS @@ -20,6 +20,8 @@ PHP NEWS . net_get_interfaces() also reports wireless network interfaces on Windows. (Yurun) . Finished AVIF support in getimagesize(). (Yannis Guyon) + . Fixed bug GH-7847 (stripos with large haystack has bad performance). + (ilutov) - Zip: . add ZipArchive::clearError() method diff --git a/Zend/zend_operators.h b/Zend/zend_operators.h index eb88eedb79da1..48418ea0b44aa 100644 --- a/Zend/zend_operators.h +++ b/Zend/zend_operators.h @@ -919,6 +919,67 @@ static zend_always_inline void zend_unwrap_reference(zval *op) /* {{{ */ } /* }}} */ +static zend_always_inline bool zend_strnieq(const char *ptr1, const char *ptr2, size_t num) +{ + const char *end = ptr1 + num; + while (ptr1 < end) { + if (zend_tolower_ascii(*ptr1++) != zend_tolower_ascii(*ptr2++)) { + return 0; + } + } + return 1; +} + +static zend_always_inline const char * +zend_memnistr(const char *haystack, const char *needle, size_t needle_len, const char *end) +{ + ZEND_ASSERT(end >= haystack); + + if (UNEXPECTED(needle_len > (end - haystack))) { + return NULL; + } + + if (UNEXPECTED(needle_len == 0)) { + return haystack; + } + + const char first_lower = zend_tolower_ascii(*needle); + const char first_upper = zend_toupper_ascii(*needle); + const char *p_lower = (const char *)memchr(haystack, first_lower, end - haystack); + const char *p_upper = NULL; + if (first_lower != first_upper) { + // If the needle length is 1 we don't need to look beyond p_lower as it is a guaranteed match + size_t upper_search_length = end - (needle_len == 1 && p_lower != NULL ? p_lower : haystack); + p_upper = (const char *)memchr(haystack, first_upper, upper_search_length); + } + const char *p = !p_upper || (p_lower && p_lower < p_upper) ? p_lower : p_upper; + + if (needle_len == 1) { + return p; + } + + const char needle_end_lower = zend_tolower_ascii(needle[needle_len - 1]); + const char needle_end_upper = zend_toupper_ascii(needle[needle_len - 1]); + end -= needle_len; + + while (p && p <= end) { + if (needle_end_lower == p[needle_len - 1] || needle_end_upper == p[needle_len - 1]) { + if (zend_strnieq(needle + 1, p + 1, needle_len - 2)) { + return p; + } + } + if (p_lower == p) { + p_lower = (const char *)memchr(p_lower + 1, first_lower, end - p_lower); + } + if (p_upper == p) { + p_upper = (const char *)memchr(p_upper + 1, first_upper, end - p_upper); + } + p = !p_upper || (p_lower && p_lower < p_upper) ? p_lower : p_upper; + } + + return NULL; +} + END_EXTERN_C() diff --git a/ext/standard/php_string.h b/ext/standard/php_string.h index f795e4e2b69f6..d2ba939a2dc20 100644 --- a/ext/standard/php_string.h +++ b/ext/standard/php_string.h @@ -48,6 +48,7 @@ PHPAPI void php_stripcslashes(zend_string *str); PHPAPI zend_string *php_basename(const char *s, size_t len, const char *suffix, size_t sufflen); PHPAPI size_t php_dirname(char *str, size_t len); PHPAPI char *php_stristr(char *s, char *t, size_t s_len, size_t t_len); +PHPAPI const char *php_stristr_no_tolower(const char *s, const char *t, size_t s_len, size_t t_len); PHPAPI zend_string *php_str_to_str(const char *haystack, size_t length, const char *needle, size_t needle_len, const char *str, size_t str_len); PHPAPI zend_string *php_trim(zend_string *str, const char *what, size_t what_len, int mode); diff --git a/ext/standard/string.c b/ext/standard/string.c index 7c2dce4595b0f..ae1d0a2963e58 100644 --- a/ext/standard/string.c +++ b/ext/standard/string.c @@ -1693,6 +1693,11 @@ PHPAPI char *php_stristr(char *s, char *t, size_t s_len, size_t t_len) } /* }}} */ +PHPAPI const char *php_stristr_no_tolower(const char *s, const char *t, size_t s_len, size_t t_len) +{ + return (const char*)php_memnistr(s, t, t_len, s + s_len); +} + /* {{{ php_strspn */ PHPAPI size_t php_strspn(const char *s1, const char *s2, const char *s1_end, const char *s2_end) { @@ -1735,8 +1740,6 @@ PHP_FUNCTION(stristr) zend_string *haystack, *needle; const char *found = NULL; size_t found_offset; - char *haystack_dup; - char *orig_needle; bool part = 0; ZEND_PARSE_PARAMETERS_START(2, 3) @@ -1746,13 +1749,10 @@ PHP_FUNCTION(stristr) Z_PARAM_BOOL(part) ZEND_PARSE_PARAMETERS_END(); - haystack_dup = estrndup(ZSTR_VAL(haystack), ZSTR_LEN(haystack)); - orig_needle = estrndup(ZSTR_VAL(needle), ZSTR_LEN(needle)); - found = php_stristr(haystack_dup, orig_needle, ZSTR_LEN(haystack), ZSTR_LEN(needle)); - efree(orig_needle); + found = php_stristr_no_tolower(ZSTR_VAL(haystack), ZSTR_VAL(needle), ZSTR_LEN(haystack), ZSTR_LEN(needle)); if (found) { - found_offset = found - haystack_dup; + found_offset = found - ZSTR_VAL(haystack); if (part) { RETVAL_STRINGL(ZSTR_VAL(haystack), found_offset); } else { @@ -1761,8 +1761,6 @@ PHP_FUNCTION(stristr) } else { RETVAL_FALSE; } - - efree(haystack_dup); } /* }}} */ @@ -1890,7 +1888,6 @@ PHP_FUNCTION(stripos) const char *found = NULL; zend_string *haystack, *needle; zend_long offset = 0; - zend_string *needle_dup = NULL, *haystack_dup; ZEND_PARSE_PARAMETERS_START(2, 3) Z_PARAM_STR(haystack) @@ -1907,23 +1904,14 @@ PHP_FUNCTION(stripos) RETURN_THROWS(); } - if (ZSTR_LEN(needle) > ZSTR_LEN(haystack)) { - RETURN_FALSE; - } - - haystack_dup = zend_string_tolower(haystack); - needle_dup = zend_string_tolower(needle); - found = (char*)php_memnstr(ZSTR_VAL(haystack_dup) + offset, - ZSTR_VAL(needle_dup), ZSTR_LEN(needle_dup), ZSTR_VAL(haystack_dup) + ZSTR_LEN(haystack)); + found = (char*)php_memnistr(ZSTR_VAL(haystack) + offset, + ZSTR_VAL(needle), ZSTR_LEN(needle), ZSTR_VAL(haystack) + ZSTR_LEN(haystack)); if (found) { - RETVAL_LONG(found - ZSTR_VAL(haystack_dup)); + RETVAL_LONG(found - ZSTR_VAL(haystack)); } else { RETVAL_FALSE; } - - zend_string_release_ex(haystack_dup, 0); - zend_string_release_ex(needle_dup, 0); } /* }}} */ diff --git a/main/php.h b/main/php.h index 91ecbb21dc726..e8fcea5b6f607 100644 --- a/main/php.h +++ b/main/php.h @@ -344,6 +344,7 @@ END_EXTERN_C() #define phpin zendin #define php_memnstr zend_memnstr +#define php_memnistr zend_memnistr /* functions */ BEGIN_EXTERN_C() From 1b800b93a7dbfeae27843a9473b463bb89b0dca4 Mon Sep 17 00:00:00 2001 From: Ilija Tovilo Date: Thu, 27 Jan 2022 22:51:26 +0100 Subject: [PATCH 2/2] Alter php_stristr() instead of creating new function --- UPGRADING.INTERNALS | 3 +++ ext/libxml/libxml.c | 5 ++--- ext/phar/phar.c | 8 ++------ ext/phar/tar.c | 9 +++------ ext/phar/zip.c | 8 ++------ ext/standard/php_string.h | 1 - ext/standard/string.c | 11 ++--------- 7 files changed, 14 insertions(+), 31 deletions(-) diff --git a/UPGRADING.INTERNALS b/UPGRADING.INTERNALS index 7175f0b54b4db..cec9356406556 100644 --- a/UPGRADING.INTERNALS +++ b/UPGRADING.INTERNALS @@ -17,6 +17,9 @@ PHP 8.2 INTERNALS UPGRADE NOTES zend_binary_str(n)casecmp() as one would expect. Call the appropriate wrapped function directly instead. * Removed the (ZEND_)WRONG_PARAM_COUNT_WITH_RETVAL() macros. +* php_stristr() no longer lowercases the haystack and needle as a side effect. + Call zend_str_tolower() yourself if necessary. You no longer need to copy + the haystack and needle before passing them to php_stristr(). ======================== 2. Build system changes diff --git a/ext/libxml/libxml.c b/ext/libxml/libxml.c index 449bf3248313e..99d5696ebcf31 100644 --- a/ext/libxml/libxml.c +++ b/ext/libxml/libxml.c @@ -378,9 +378,9 @@ php_libxml_input_buffer_create_filename(const char *URI, xmlCharEncoding enc) const char buf[] = "Content-Type:"; if (Z_TYPE_P(header) == IS_STRING && !zend_binary_strncasecmp(Z_STRVAL_P(header), Z_STRLEN_P(header), buf, sizeof(buf)-1, sizeof(buf)-1)) { - char *needle = estrdup("charset="); + char needle[] = "charset="; char *haystack = estrndup(Z_STRVAL_P(header), Z_STRLEN_P(header)); - char *encoding = php_stristr(haystack, needle, Z_STRLEN_P(header), sizeof("charset=")-1); + char *encoding = php_stristr(haystack, needle, Z_STRLEN_P(header), strlen(needle)); if (encoding) { char *end; @@ -408,7 +408,6 @@ php_libxml_input_buffer_create_filename(const char *URI, xmlCharEncoding enc) } } efree(haystack); - efree(needle); break; /* found content-type */ } } ZEND_HASH_FOREACH_END(); diff --git a/ext/phar/phar.c b/ext/phar/phar.c index 91310c21e4967..1ffa9a9d4fdd4 100644 --- a/ext/phar/phar.c +++ b/ext/phar/phar.c @@ -2531,7 +2531,6 @@ int phar_flush(phar_archive_data *phar, char *user_stub, zend_long len, int conv { char halt_stub[] = "__HALT_COMPILER();"; zend_string *newstub; - char *tmp; phar_entry_info *entry, *newentry; size_t halt_offset; int restore_alias_len, global_flags = 0, closeoldfile; @@ -2635,9 +2634,7 @@ int phar_flush(phar_archive_data *phar, char *user_stub, zend_long len, int conv } else { free_user_stub = 0; } - tmp = estrndup(user_stub, len); - if ((pos = php_stristr(tmp, halt_stub, len, sizeof(halt_stub) - 1)) == NULL) { - efree(tmp); + if ((pos = php_stristr(user_stub, halt_stub, len, sizeof(halt_stub) - 1)) == NULL) { if (closeoldfile) { php_stream_close(oldfile); } @@ -2650,8 +2647,7 @@ int phar_flush(phar_archive_data *phar, char *user_stub, zend_long len, int conv } return EOF; } - pos = user_stub + (pos - tmp); - efree(tmp); + pos = user_stub + (pos - user_stub); len = pos - user_stub + 18; if ((size_t)len != php_stream_write(newfile, user_stub, len) || 5 != php_stream_write(newfile, " ?>\r\n", 5)) { diff --git a/ext/phar/tar.c b/ext/phar/tar.c index 07b784a0b9df2..189ee16aa1def 100644 --- a/ext/phar/tar.c +++ b/ext/phar/tar.c @@ -967,7 +967,7 @@ int phar_tar_flush(phar_archive_data *phar, char *user_stub, zend_long len, int int closeoldfile, free_user_stub; size_t signature_length; struct _phar_pass_tar_info pass; - char *buf, *signature, *tmp, sigbuf[8]; + char *buf, *signature, sigbuf[8]; char halt_stub[] = "__HALT_COMPILER();"; entry.flags = PHAR_ENT_PERM_DEF_FILE; @@ -1063,9 +1063,7 @@ int phar_tar_flush(phar_archive_data *phar, char *user_stub, zend_long len, int free_user_stub = 0; } - tmp = estrndup(user_stub, len); - if ((pos = php_stristr(tmp, halt_stub, len, sizeof(halt_stub) - 1)) == NULL) { - efree(tmp); + if ((pos = php_stristr(user_stub, halt_stub, len, sizeof(halt_stub) - 1)) == NULL) { if (error) { spprintf(error, 0, "illegal stub for tar-based phar \"%s\"", phar->fname); } @@ -1074,8 +1072,7 @@ int phar_tar_flush(phar_archive_data *phar, char *user_stub, zend_long len, int } return EOF; } - pos = user_stub + (pos - tmp); - efree(tmp); + pos = user_stub + (pos - user_stub); len = pos - user_stub + 18; entry.fp = php_stream_fopen_tmpfile(); diff --git a/ext/phar/zip.c b/ext/phar/zip.c index c5e38cabf7b87..e3beed33c3e63 100644 --- a/ext/phar/zip.c +++ b/ext/phar/zip.c @@ -1202,7 +1202,6 @@ int phar_zip_flush(phar_archive_data *phar, char *user_stub, zend_long len, int char *pos; static const char newstub[] = "fname); } @@ -1316,8 +1313,7 @@ int phar_zip_flush(phar_archive_data *phar, char *user_stub, zend_long len, int } return EOF; } - pos = user_stub + (pos - tmp); - efree(tmp); + pos = user_stub + (pos - user_stub); len = pos - user_stub + 18; entry.fp = php_stream_fopen_tmpfile(); diff --git a/ext/standard/php_string.h b/ext/standard/php_string.h index d2ba939a2dc20..f795e4e2b69f6 100644 --- a/ext/standard/php_string.h +++ b/ext/standard/php_string.h @@ -48,7 +48,6 @@ PHPAPI void php_stripcslashes(zend_string *str); PHPAPI zend_string *php_basename(const char *s, size_t len, const char *suffix, size_t sufflen); PHPAPI size_t php_dirname(char *str, size_t len); PHPAPI char *php_stristr(char *s, char *t, size_t s_len, size_t t_len); -PHPAPI const char *php_stristr_no_tolower(const char *s, const char *t, size_t s_len, size_t t_len); PHPAPI zend_string *php_str_to_str(const char *haystack, size_t length, const char *needle, size_t needle_len, const char *str, size_t str_len); PHPAPI zend_string *php_trim(zend_string *str, const char *what, size_t what_len, int mode); diff --git a/ext/standard/string.c b/ext/standard/string.c index ae1d0a2963e58..36e8c1a737e77 100644 --- a/ext/standard/string.c +++ b/ext/standard/string.c @@ -1687,17 +1687,10 @@ PHP_FUNCTION(pathinfo) case insensitive strstr */ PHPAPI char *php_stristr(char *s, char *t, size_t s_len, size_t t_len) { - zend_str_tolower(s, s_len); - zend_str_tolower(t, t_len); - return (char*)php_memnstr(s, t, t_len, s + s_len); + return (char*)php_memnistr(s, t, t_len, s + s_len); } /* }}} */ -PHPAPI const char *php_stristr_no_tolower(const char *s, const char *t, size_t s_len, size_t t_len) -{ - return (const char*)php_memnistr(s, t, t_len, s + s_len); -} - /* {{{ php_strspn */ PHPAPI size_t php_strspn(const char *s1, const char *s2, const char *s1_end, const char *s2_end) { @@ -1749,7 +1742,7 @@ PHP_FUNCTION(stristr) Z_PARAM_BOOL(part) ZEND_PARSE_PARAMETERS_END(); - found = php_stristr_no_tolower(ZSTR_VAL(haystack), ZSTR_VAL(needle), ZSTR_LEN(haystack), ZSTR_LEN(needle)); + found = php_stristr(ZSTR_VAL(haystack), ZSTR_VAL(needle), ZSTR_LEN(haystack), ZSTR_LEN(needle)); if (found) { found_offset = found - ZSTR_VAL(haystack);