-
Notifications
You must be signed in to change notification settings - Fork 7.9k
Improve browscap get_browser performance #12651
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ea690a8
30f4c4d
6efb9f3
565ff0a
28e8d1f
ebaa78e
dcaaf2c
5ff4bfb
efab4ef
1dffacf
e9abd10
a89b8e6
f900cdb
6b3ed9b
a903d57
0fdba30
3afc13c
970ab32
7e3f0b8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -152,20 +152,16 @@ static zend_string *browscap_convert_pattern(zend_string *pattern, int persisten | |
size_t i, j=0; | ||
char *t; | ||
zend_string *res; | ||
char *lc_pattern; | ||
ALLOCA_FLAG(use_heap); | ||
|
||
res = zend_string_alloc(browscap_compute_regex_len(pattern), persistent); | ||
t = ZSTR_VAL(res); | ||
|
||
lc_pattern = do_alloca(ZSTR_LEN(pattern) + 1, use_heap); | ||
zend_str_tolower_copy(lc_pattern, ZSTR_VAL(pattern), ZSTR_LEN(pattern)); | ||
|
||
t[j++] = '~'; | ||
t[j++] = '^'; | ||
|
||
for (i = 0; i < ZSTR_LEN(pattern); i++, j++) { | ||
switch (lc_pattern[i]) { | ||
char c = ZSTR_VAL(pattern)[i]; | ||
switch (c) { | ||
case '?': | ||
t[j] = '.'; | ||
break; | ||
|
@@ -198,7 +194,7 @@ static zend_string *browscap_convert_pattern(zend_string *pattern, int persisten | |
t[j] = '+'; | ||
break; | ||
default: | ||
t[j] = lc_pattern[i]; | ||
t[j] = zend_tolower_ascii(c); | ||
break; | ||
} | ||
} | ||
|
@@ -208,7 +204,6 @@ static zend_string *browscap_convert_pattern(zend_string *pattern, int persisten | |
t[j]=0; | ||
|
||
ZSTR_LEN(res) = j; | ||
free_alloca(lc_pattern, use_heap); | ||
return res; | ||
} | ||
/* }}} */ | ||
|
@@ -272,27 +267,39 @@ static void browscap_add_kv( | |
bdata->kv_used++; | ||
} | ||
|
||
static void browscap_entry_add_kv_to_existing_array(browser_data *bdata, browscap_entry *entry, HashTable *ht) { | ||
for (uint32_t i = entry->kv_start; i < entry->kv_end; i++) { | ||
zval tmp; | ||
ZVAL_STR_COPY(&tmp, bdata->kv[i].value); | ||
zend_hash_add(ht, bdata->kv[i].key, &tmp); | ||
} | ||
} | ||
|
||
static HashTable *browscap_entry_to_array(browser_data *bdata, browscap_entry *entry) { | ||
zval tmp; | ||
uint32_t i; | ||
|
||
HashTable *ht = zend_new_array(8); | ||
HashTable *ht = zend_new_array(2 + (entry->parent ? 1 : 0) + (entry->kv_end - entry->kv_start)); | ||
|
||
ZVAL_STR(&tmp, browscap_convert_pattern(entry->pattern, 0)); | ||
zend_hash_str_add(ht, "browser_name_regex", sizeof("browser_name_regex")-1, &tmp); | ||
zend_string *key = ZSTR_INIT_LITERAL("browser_name_regex", 0); | ||
ZSTR_H(key) = zend_inline_hash_func("browser_name_regex", sizeof("browser_name_regex")-1); | ||
zend_hash_add_new(ht, key, &tmp); | ||
zend_string_release_ex(key, false); | ||
|
||
ZVAL_STR_COPY(&tmp, entry->pattern); | ||
zend_hash_str_add(ht, "browser_name_pattern", sizeof("browser_name_pattern")-1, &tmp); | ||
key = ZSTR_INIT_LITERAL("browser_name_pattern", 0); | ||
ZSTR_H(key) = zend_inline_hash_func("browser_name_pattern", sizeof("browser_name_pattern")-1); | ||
zend_hash_add_new(ht, key, &tmp); | ||
zend_string_release_ex(key, false); | ||
|
||
if (entry->parent) { | ||
ZVAL_STR_COPY(&tmp, entry->parent); | ||
zend_hash_str_add(ht, "parent", sizeof("parent")-1, &tmp); | ||
key = ZSTR_INIT_LITERAL("parent", 0); | ||
ZSTR_H(key) = zend_inline_hash_func("parent", sizeof("parent")-1); | ||
zend_hash_add_new(ht, key, &tmp); | ||
zend_string_release_ex(key, false); | ||
} | ||
|
||
for (i = entry->kv_start; i < entry->kv_end; i++) { | ||
ZVAL_STR_COPY(&tmp, bdata->kv[i].value); | ||
zend_hash_add(ht, bdata->kv[i].key, &tmp); | ||
} | ||
browscap_entry_add_kv_to_existing_array(bdata, entry, ht); | ||
|
||
return ht; | ||
} | ||
|
@@ -544,31 +551,89 @@ static inline size_t browscap_get_minimum_length(browscap_entry *entry) { | |
return len; | ||
} | ||
|
||
static int browser_reg_compare(browscap_entry *entry, zend_string *agent_name, browscap_entry **found_entry_ptr) /* {{{ */ | ||
static bool browscap_match_string_wildcard(const char *s, const char *s_end, const char *pattern, const char *pattern_end) | ||
{ | ||
browscap_entry *found_entry = *found_entry_ptr; | ||
ALLOCA_FLAG(use_heap) | ||
zend_string *pattern_lc, *regex; | ||
const char *cur; | ||
int i; | ||
const char *pattern_current = pattern; | ||
const char *s_current = s; | ||
|
||
const char *wildcard_pattern_restore_pos = NULL; | ||
const char *wildcard_s_restore_pos = NULL; | ||
|
||
while (s_current < s_end) { | ||
char pattern_char = *pattern_current; | ||
char s_char = *s_current; | ||
|
||
if (pattern_char == '*') { | ||
/* Collapse wildcards */ | ||
pattern_current++; | ||
while (pattern_current < pattern_end && *pattern_current == '*') { | ||
pattern_current++; | ||
} | ||
|
||
pcre2_code *re; | ||
pcre2_match_data *match_data; | ||
uint32_t capture_count; | ||
int rc; | ||
/* If we're at the end of the pattern, it means that the ending was just '*', so this is a trivial match */ | ||
if (pattern_current == pattern_end) { | ||
return true; | ||
} | ||
|
||
/* Optimization: if there is a non-wildcard character X after a *, then we can immediately jump to the first | ||
* character X in s starting from s_current because it is the only way to match beyond the *. */ | ||
if (*pattern_current != '?') { | ||
while (s_current < s_end && *s_current != *pattern_current) { | ||
s_current++; | ||
} | ||
} | ||
|
||
/* We will first assume the skipped part by * is a 0-length string (or n-length if the optimization above skipped n characters). | ||
* When a mismatch happens we will backtrack and move s one position to assume * skipped a 1-length string. | ||
* Then 2, 3, 4, ... */ | ||
wildcard_pattern_restore_pos = pattern_current; | ||
wildcard_s_restore_pos = s_current; | ||
|
||
continue; | ||
} else if (pattern_char == s_char || pattern_char == '?') { | ||
/* Match */ | ||
pattern_current++; | ||
s_current++; | ||
|
||
/* If this was the last character of the pattern, we either fully matched s, or we have a mismatch */ | ||
if (pattern_current == pattern_end) { | ||
if (s_current == s_end) { | ||
return true; | ||
} | ||
/* Fallthrough to mismatch */ | ||
} else { | ||
continue; | ||
} | ||
} | ||
|
||
/* Agent name too short */ | ||
if (ZSTR_LEN(agent_name) < browscap_get_minimum_length(entry)) { | ||
return 0; | ||
/* Mismatch */ | ||
if (wildcard_pattern_restore_pos) { | ||
pattern_current = wildcard_pattern_restore_pos; | ||
wildcard_s_restore_pos++; | ||
s_current = wildcard_s_restore_pos; | ||
} else { | ||
/* No wildcard is active, so it is impossible to match */ | ||
return false; | ||
} | ||
} | ||
|
||
/* Quickly discard patterns where the prefix doesn't match. */ | ||
if (zend_binary_strcasecmp( | ||
ZSTR_VAL(agent_name), entry->prefix_len, | ||
ZSTR_VAL(entry->pattern), entry->prefix_len) != 0) { | ||
return 0; | ||
/* Skip remaining * wildcards, they match nothing here as we are at the end of s */ | ||
while (pattern_current < pattern_end && *pattern_current == '*') { | ||
pattern_current++; | ||
} | ||
|
||
ZEND_ASSERT(s_current == s_end); | ||
return pattern_current == pattern_end; | ||
} | ||
|
||
static int browser_reg_compare(browscap_entry *entry, zend_string *agent_name, browscap_entry **found_entry_ptr, size_t *cached_prev_len) /* {{{ */ | ||
{ | ||
browscap_entry *found_entry = *found_entry_ptr; | ||
ALLOCA_FLAG(use_heap) | ||
zend_string *pattern_lc; | ||
const char *cur; | ||
int i; | ||
|
||
/* Lowercase the pattern, the agent name is already lowercase */ | ||
ZSTR_ALLOCA_ALLOC(pattern_lc, ZSTR_LEN(entry->pattern), use_heap); | ||
zend_str_tolower_copy(ZSTR_VAL(pattern_lc), ZSTR_VAL(entry->pattern), ZSTR_LEN(entry->pattern)); | ||
|
@@ -592,91 +657,52 @@ static int browser_reg_compare(browscap_entry *entry, zend_string *agent_name, b | |
/* See if we have an exact match, if so, we're done... */ | ||
if (zend_string_equals(agent_name, pattern_lc)) { | ||
*found_entry_ptr = entry; | ||
/* cached_prev_len doesn't matter here because we end the search when an exact match is found. */ | ||
ZSTR_ALLOCA_FREE(pattern_lc, use_heap); | ||
return 1; | ||
} | ||
|
||
regex = browscap_convert_pattern(entry->pattern, 0); | ||
re = pcre_get_compiled_regex(regex, &capture_count); | ||
if (re == NULL) { | ||
ZSTR_ALLOCA_FREE(pattern_lc, use_heap); | ||
zend_string_release(regex); | ||
return 0; | ||
} | ||
|
||
match_data = php_pcre_create_match_data(capture_count, re); | ||
if (!match_data) { | ||
ZSTR_ALLOCA_FREE(pattern_lc, use_heap); | ||
zend_string_release(regex); | ||
return 0; | ||
} | ||
rc = pcre2_match(re, (PCRE2_SPTR)ZSTR_VAL(agent_name), ZSTR_LEN(agent_name), 0, 0, match_data, php_pcre_mctx()); | ||
php_pcre_free_match_data(match_data); | ||
if (rc >= 0) { | ||
if (browscap_match_string_wildcard( | ||
ZSTR_VAL(agent_name) + entry->prefix_len, | ||
ZSTR_VAL(agent_name) + ZSTR_LEN(agent_name), | ||
ZSTR_VAL(pattern_lc) + entry->prefix_len, | ||
ZSTR_VAL(pattern_lc) + ZSTR_LEN(pattern_lc) | ||
)) { | ||
/* If we've found a possible browser, we need to do a comparison of the | ||
number of characters changed in the user agent being checked versus | ||
the previous match found and the current match. */ | ||
if (found_entry) { | ||
size_t i, prev_len = 0, curr_len = 0; | ||
zend_string *previous_match = found_entry->pattern; | ||
zend_string *current_match = entry->pattern; | ||
|
||
for (i = 0; i < ZSTR_LEN(previous_match); i++) { | ||
switch (ZSTR_VAL(previous_match)[i]) { | ||
case '?': | ||
case '*': | ||
/* do nothing, ignore these characters in the count */ | ||
break; | ||
|
||
default: | ||
++prev_len; | ||
} | ||
} | ||
|
||
for (i = 0; i < ZSTR_LEN(current_match); i++) { | ||
switch (ZSTR_VAL(current_match)[i]) { | ||
case '?': | ||
case '*': | ||
/* do nothing, ignore these characters in the count */ | ||
break; | ||
size_t curr_len = entry->prefix_len; /* Start from the prefix because the prefix is free of wildcards */ | ||
zend_string *current_match = entry->pattern; | ||
for (size_t i = curr_len; i < ZSTR_LEN(current_match); i++) { | ||
switch (ZSTR_VAL(current_match)[i]) { | ||
case '?': | ||
case '*': | ||
/* do nothing, ignore these characters in the count */ | ||
break; | ||
|
||
default: | ||
++curr_len; | ||
} | ||
default: | ||
++curr_len; | ||
} | ||
} | ||
|
||
if (found_entry) { | ||
/* Pick which browser pattern replaces the least amount of | ||
characters when compared to the original user agent string... */ | ||
if (prev_len < curr_len) { | ||
if (*cached_prev_len < curr_len) { | ||
*found_entry_ptr = entry; | ||
*cached_prev_len = curr_len; | ||
} | ||
} else { | ||
*found_entry_ptr = entry; | ||
*cached_prev_len = curr_len; | ||
} | ||
} | ||
|
||
ZSTR_ALLOCA_FREE(pattern_lc, use_heap); | ||
zend_string_release(regex); | ||
return 0; | ||
} | ||
/* }}} */ | ||
|
||
static void browscap_zval_copy_ctor(zval *p) /* {{{ */ | ||
{ | ||
if (Z_REFCOUNTED_P(p)) { | ||
zend_string *str; | ||
|
||
ZEND_ASSERT(Z_TYPE_P(p) == IS_STRING); | ||
str = Z_STR_P(p); | ||
if (!(GC_FLAGS(str) & GC_PERSISTENT)) { | ||
GC_ADDREF(str); | ||
} else { | ||
ZVAL_NEW_STR(p, zend_string_init(ZSTR_VAL(str), ZSTR_LEN(str), 0)); | ||
} | ||
} | ||
} | ||
/* }}} */ | ||
|
||
/* {{{ Get information about the capabilities of a browser. If browser_name is omitted or null, HTTP_USER_AGENT is used. Returns an object by default; if return_array is true, returns an array. */ | ||
PHP_FUNCTION(get_browser) | ||
{ | ||
|
@@ -726,9 +752,31 @@ PHP_FUNCTION(get_browser) | |
found_entry = zend_hash_find_ptr(bdata->htab, lookup_browser_name); | ||
if (found_entry == NULL) { | ||
browscap_entry *entry; | ||
size_t cached_prev_len = 0; /* silence compiler warning */ | ||
|
||
ZEND_HASH_MAP_FOREACH_PTR(bdata->htab, entry) { | ||
/* The following two early-skip checks are inside this loop instead of inside browser_reg_compare(). | ||
* That's because we want to avoid the call frame overhead, especially as browser_reg_compare() is | ||
* a function that uses alloca(). */ | ||
|
||
/* Agent name too short */ | ||
if (ZSTR_LEN(lookup_browser_name) < browscap_get_minimum_length(entry)) { | ||
continue; | ||
} | ||
|
||
/* Quickly discard patterns where the prefix doesn't match. */ | ||
bool prefix_matches = true; | ||
for (size_t i = 0; i < entry->prefix_len; i++) { | ||
if (ZSTR_VAL(lookup_browser_name)[i] != zend_tolower_ascii(ZSTR_VAL(entry->pattern)[i])) { | ||
prefix_matches = false; | ||
break; | ||
} | ||
} | ||
if (!prefix_matches) { | ||
continue; | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add a comment explaning why this is not in the comparison function so people know why There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right, good point. Otherwise someone else in the future may move them again. |
||
|
||
ZEND_HASH_FOREACH_PTR(bdata->htab, entry) { | ||
if (browser_reg_compare(entry, lookup_browser_name, &found_entry)) { | ||
if (browser_reg_compare(entry, lookup_browser_name, &found_entry, &cached_prev_len)) { | ||
break; | ||
} | ||
} ZEND_HASH_FOREACH_END(); | ||
|
@@ -737,12 +785,14 @@ PHP_FUNCTION(get_browser) | |
found_entry = zend_hash_str_find_ptr(bdata->htab, | ||
DEFAULT_SECTION_NAME, sizeof(DEFAULT_SECTION_NAME)-1); | ||
if (found_entry == NULL) { | ||
zend_string_release(lookup_browser_name); | ||
zend_string_release_ex(lookup_browser_name, false); | ||
RETURN_FALSE; | ||
} | ||
} | ||
} | ||
|
||
zend_string_release_ex(lookup_browser_name, false); | ||
|
||
agent_ht = browscap_entry_to_array(bdata, found_entry); | ||
|
||
if (return_array) { | ||
|
@@ -751,23 +801,15 @@ PHP_FUNCTION(get_browser) | |
object_and_properties_init(return_value, zend_standard_class_def, agent_ht); | ||
} | ||
|
||
HashTable *target_ht = return_array ? Z_ARRVAL_P(return_value) : Z_OBJPROP_P(return_value); | ||
|
||
while (found_entry->parent) { | ||
found_entry = zend_hash_find_ptr(bdata->htab, found_entry->parent); | ||
if (found_entry == NULL) { | ||
break; | ||
} | ||
|
||
agent_ht = browscap_entry_to_array(bdata, found_entry); | ||
if (return_array) { | ||
zend_hash_merge(Z_ARRVAL_P(return_value), agent_ht, (copy_ctor_func_t) browscap_zval_copy_ctor, 0); | ||
} else { | ||
zend_hash_merge(Z_OBJPROP_P(return_value), agent_ht, (copy_ctor_func_t) browscap_zval_copy_ctor, 0); | ||
} | ||
|
||
zend_hash_destroy(agent_ht); | ||
efree(agent_ht); | ||
browscap_entry_add_kv_to_existing_array(bdata, found_entry, target_ht); | ||
} | ||
|
||
zend_string_release_ex(lookup_browser_name, 0); | ||
} | ||
/* }}} */ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
isn't ZSTR_LEN will call on every iteration
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is a macro that just points to the size field of the zend_string struct.