From a63852fb7e8d3198f53c8452c0e1a8f7b32eb1e6 Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Thu, 12 Dec 2024 20:24:04 +0100 Subject: [PATCH 1/3] Fix GH-17122: memory leak in regex Because the subpattern names are persistent, and the fact that the symbol table destruction is skipped when using fast_shutdown, this means the refcounts will not be updated for the destruction of the arrays that hold the subpattern name keys. To solve this, detect this situation and duplicate the strings. --- ext/pcre/php_pcre.c | 50 ++++++++++++++++++++++++++++++++++--- ext/pcre/tests/gh17122.phpt | 30 ++++++++++++++++++++++ 2 files changed, 76 insertions(+), 4 deletions(-) create mode 100644 ext/pcre/tests/gh17122.phpt diff --git a/ext/pcre/php_pcre.c b/ext/pcre/php_pcre.c index 19068b90c0d0b..6361a65e1fdb6 100644 --- a/ext/pcre/php_pcre.c +++ b/ext/pcre/php_pcre.c @@ -549,6 +549,22 @@ static zend_string **make_subpats_table(uint32_t name_cnt, pcre_cache_entry *pce } /* }}} */ +/* Because the subpattern names are persistent, and the fact that the + * symbol table destruction is skipped when using fast_shutdown, + * this means the refcounts will not be updated for the destruction of + * the arrays that hold the subpattern name keys. + * To solve this, detect this situation and duplicate the strings. */ +static zend_always_inline bool has_symbol_table_hazard(void) +{ + zend_execute_data *ex = EG(current_execute_data); + if (UNEXPECTED(!ex || !ex->prev_execute_data)) { + return false; + } + /* Note: we may also narrow this to only check if the symbol table equals + * &EG(symbol_table), but this is a cheaper check. */ + return ZEND_CALL_INFO(ex->prev_execute_data) & ZEND_CALL_HAS_SYMBOL_TABLE; +} + /* {{{ static calculate_unit_length */ /* Calculates the byte length of the next character. Assumes valid UTF-8 for PCRE2_UTF. */ static zend_always_inline size_t calculate_unit_length(pcre_cache_entry *pce, const char *start) @@ -971,12 +987,26 @@ static zend_always_inline void populate_match_value( static inline void add_named( HashTable *const subpats, zend_string *name, zval *val, bool unmatched) { + ZEND_ASSERT(GC_FLAGS(name) & IS_STR_PERSISTENT); + + bool symbol_table_hazard = has_symbol_table_hazard(); + /* If the DUPNAMES option is used, multiple subpatterns might have the same name. * In this case we want to preserve the one that actually has a value. */ if (!unmatched) { - zend_hash_update(subpats, name, val); + if (EXPECTED(!symbol_table_hazard)) { + zend_hash_update(subpats, name, val); + } else { + zend_hash_str_update(subpats, ZSTR_VAL(name), ZSTR_LEN(name), val); + } } else { - if (!zend_hash_add(subpats, name, val)) { + zval *zv; + if (EXPECTED(!symbol_table_hazard)) { + zv = zend_hash_add(subpats, name, val); + } else { + zv = zend_hash_str_add(subpats, ZSTR_VAL(name), ZSTR_LEN(name), val); + } + if (!zv) { return; } } @@ -1061,10 +1091,16 @@ static void populate_subpat_array( zend_hash_next_index_insert_new(subpats_ht, &val); } if (unmatched_as_null) { + bool symbol_table_hazard = has_symbol_table_hazard(); + for (i = count; i < num_subpats; i++) { ZVAL_NULL(&val); if (subpat_names[i]) { - zend_hash_add(subpats_ht, subpat_names[i], &val); + if (EXPECTED(!symbol_table_hazard)) { + zend_hash_add(subpats_ht, subpat_names[i], &val); + } else { + zend_hash_str_add(subpats_ht, ZSTR_VAL(subpat_names[i]), ZSTR_LEN(subpat_names[i]), &val); + } } zend_hash_next_index_insert_new(subpats_ht, &val); } @@ -1429,11 +1465,17 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, /* Add the match sets to the output array and clean up */ if (match_sets) { if (subpat_names) { + bool symbol_table_hazard = has_symbol_table_hazard(); + for (i = 0; i < num_subpats; i++) { zval wrapper; ZVAL_ARR(&wrapper, match_sets[i]); if (subpat_names[i]) { - zend_hash_update(Z_ARRVAL_P(subpats), subpat_names[i], &wrapper); + if (EXPECTED(!symbol_table_hazard)) { + zend_hash_update(Z_ARRVAL_P(subpats), subpat_names[i], &wrapper); + } else { + zend_hash_str_update(Z_ARRVAL_P(subpats), ZSTR_VAL(subpat_names[i]), ZSTR_LEN(subpat_names[i]), &wrapper); + } GC_ADDREF(match_sets[i]); } zend_hash_next_index_insert_new(Z_ARRVAL_P(subpats), &wrapper); diff --git a/ext/pcre/tests/gh17122.phpt b/ext/pcre/tests/gh17122.phpt new file mode 100644 index 0000000000000..afccb47b96dab --- /dev/null +++ b/ext/pcre/tests/gh17122.phpt @@ -0,0 +1,30 @@ +--TEST-- +GH-17122 (memory leak in regex) +--FILE-- +)(\d+)|', 0xffffffff, $m1); +var_dump($m1); +\preg_match('|(?P)(\d+)|', 0, $m2); +var_dump($m2); +?> +--EXPECT-- +array(4) { + [0]=> + string(10) "4294967295" + ["name"]=> + string(0) "" + [1]=> + string(0) "" + [2]=> + string(10) "4294967295" +} +array(4) { + [0]=> + string(1) "0" + ["name2"]=> + string(0) "" + [1]=> + string(0) "" + [2]=> + string(1) "0" +} From d9752bd3c693638772722509e74dff679d099974 Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Thu, 12 Dec 2024 21:04:00 +0100 Subject: [PATCH 2/3] simpler solution --- ext/pcre/php_pcre.c | 61 ++++++++++++--------------------------------- 1 file changed, 16 insertions(+), 45 deletions(-) diff --git a/ext/pcre/php_pcre.c b/ext/pcre/php_pcre.c index 6361a65e1fdb6..73ac159e2c7c7 100644 --- a/ext/pcre/php_pcre.c +++ b/ext/pcre/php_pcre.c @@ -506,10 +506,21 @@ static int pcre_clean_cache(zval *data, void *arg) /* }}} */ static void free_subpats_table(zend_string **subpat_names, uint32_t num_subpats) { + bool destroy = EG(flags) & EG_FLAGS_IN_SHUTDOWN; + uint32_t i; for (i = 0; i < num_subpats; i++) { if (subpat_names[i]) { - zend_string_release_ex(subpat_names[i], true); + if (destroy) { + /* Because the subpattern names are persistent, and the fact that the + * symbol table destruction is skipped when using fast_shutdown, + * this means the refcounts will not be updated for the destruction of + * the arrays that hold the subpattern name keys. */ + ZEND_ASSERT(!ZSTR_IS_INTERNED(subpat_names[i])); + pefree(subpat_names[i], true); + } else { + zend_string_release_ex(subpat_names[i], true); + } } } pefree(subpat_names, true); @@ -549,22 +560,6 @@ static zend_string **make_subpats_table(uint32_t name_cnt, pcre_cache_entry *pce } /* }}} */ -/* Because the subpattern names are persistent, and the fact that the - * symbol table destruction is skipped when using fast_shutdown, - * this means the refcounts will not be updated for the destruction of - * the arrays that hold the subpattern name keys. - * To solve this, detect this situation and duplicate the strings. */ -static zend_always_inline bool has_symbol_table_hazard(void) -{ - zend_execute_data *ex = EG(current_execute_data); - if (UNEXPECTED(!ex || !ex->prev_execute_data)) { - return false; - } - /* Note: we may also narrow this to only check if the symbol table equals - * &EG(symbol_table), but this is a cheaper check. */ - return ZEND_CALL_INFO(ex->prev_execute_data) & ZEND_CALL_HAS_SYMBOL_TABLE; -} - /* {{{ static calculate_unit_length */ /* Calculates the byte length of the next character. Assumes valid UTF-8 for PCRE2_UTF. */ static zend_always_inline size_t calculate_unit_length(pcre_cache_entry *pce, const char *start) @@ -989,24 +984,12 @@ static inline void add_named( HashTable *const subpats, zend_string *name, zval *val, bool unmatched) { ZEND_ASSERT(GC_FLAGS(name) & IS_STR_PERSISTENT); - bool symbol_table_hazard = has_symbol_table_hazard(); - /* If the DUPNAMES option is used, multiple subpatterns might have the same name. * In this case we want to preserve the one that actually has a value. */ if (!unmatched) { - if (EXPECTED(!symbol_table_hazard)) { - zend_hash_update(subpats, name, val); - } else { - zend_hash_str_update(subpats, ZSTR_VAL(name), ZSTR_LEN(name), val); - } + zend_hash_update(subpats, name, val); } else { - zval *zv; - if (EXPECTED(!symbol_table_hazard)) { - zv = zend_hash_add(subpats, name, val); - } else { - zv = zend_hash_str_add(subpats, ZSTR_VAL(name), ZSTR_LEN(name), val); - } - if (!zv) { + if (!zend_hash_add(subpats, name, val)) { return; } } @@ -1091,16 +1074,10 @@ static void populate_subpat_array( zend_hash_next_index_insert_new(subpats_ht, &val); } if (unmatched_as_null) { - bool symbol_table_hazard = has_symbol_table_hazard(); - for (i = count; i < num_subpats; i++) { ZVAL_NULL(&val); if (subpat_names[i]) { - if (EXPECTED(!symbol_table_hazard)) { - zend_hash_add(subpats_ht, subpat_names[i], &val); - } else { - zend_hash_str_add(subpats_ht, ZSTR_VAL(subpat_names[i]), ZSTR_LEN(subpat_names[i]), &val); - } + zend_hash_add(subpats_ht, subpat_names[i], &val); } zend_hash_next_index_insert_new(subpats_ht, &val); } @@ -1465,17 +1442,11 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, /* Add the match sets to the output array and clean up */ if (match_sets) { if (subpat_names) { - bool symbol_table_hazard = has_symbol_table_hazard(); - for (i = 0; i < num_subpats; i++) { zval wrapper; ZVAL_ARR(&wrapper, match_sets[i]); if (subpat_names[i]) { - if (EXPECTED(!symbol_table_hazard)) { - zend_hash_update(Z_ARRVAL_P(subpats), subpat_names[i], &wrapper); - } else { - zend_hash_str_update(Z_ARRVAL_P(subpats), ZSTR_VAL(subpat_names[i]), ZSTR_LEN(subpat_names[i]), &wrapper); - } + zend_hash_update(Z_ARRVAL_P(subpats), subpat_names[i], &wrapper); GC_ADDREF(match_sets[i]); } zend_hash_next_index_insert_new(Z_ARRVAL_P(subpats), &wrapper); From 7a24bb6d8c1204ba971980529148301384fe2b5c Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Sat, 21 Dec 2024 18:32:08 +0100 Subject: [PATCH 3/3] New new approach --- ext/pcre/php_pcre.c | 87 ++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 45 deletions(-) diff --git a/ext/pcre/php_pcre.c b/ext/pcre/php_pcre.c index 73ac159e2c7c7..5671957daac52 100644 --- a/ext/pcre/php_pcre.c +++ b/ext/pcre/php_pcre.c @@ -49,10 +49,14 @@ char *php_pcre_version; struct _pcre_cache_entry { pcre2_code *re; - /* Pointer is not NULL when there are named captures. - * Length is equal to capture_count + 1 to account for capture group 0. */ + /* Pointer is not NULL (during request) when there are named captures. + * Length is equal to capture_count + 1 to account for capture group 0. + * This table cache is only valid during request. + * Trying to store this over multiple requests causes issues when the keys are exposed in user arrays + * (see GH-17122 and GH-17132). */ zend_string **subpats_table; uint32_t preg_options; + uint32_t name_count; uint32_t capture_count; uint32_t compile_options; uint32_t refcount; @@ -478,6 +482,14 @@ static PHP_RINIT_FUNCTION(pcre) static PHP_RSHUTDOWN_FUNCTION(pcre) { + pcre_cache_entry *pce; + ZEND_HASH_MAP_FOREACH_PTR(&PCRE_G(pcre_cache), pce) { + if (pce->subpats_table) { + free_subpats_table(pce->subpats_table, pce->capture_count + 1); + pce->subpats_table = NULL; + } + } ZEND_HASH_FOREACH_END(); + pcre2_general_context_free(PCRE_G(gctx_zmm)); PCRE_G(gctx_zmm) = NULL; @@ -506,24 +518,13 @@ static int pcre_clean_cache(zval *data, void *arg) /* }}} */ static void free_subpats_table(zend_string **subpat_names, uint32_t num_subpats) { - bool destroy = EG(flags) & EG_FLAGS_IN_SHUTDOWN; - uint32_t i; for (i = 0; i < num_subpats; i++) { if (subpat_names[i]) { - if (destroy) { - /* Because the subpattern names are persistent, and the fact that the - * symbol table destruction is skipped when using fast_shutdown, - * this means the refcounts will not be updated for the destruction of - * the arrays that hold the subpattern name keys. */ - ZEND_ASSERT(!ZSTR_IS_INTERNED(subpat_names[i])); - pefree(subpat_names[i], true); - } else { - zend_string_release_ex(subpat_names[i], true); - } + zend_string_release_ex(subpat_names[i], false); } } - pefree(subpat_names, true); + efree(subpat_names); } /* {{{ static make_subpats_table */ @@ -542,24 +543,25 @@ static zend_string **make_subpats_table(uint32_t name_cnt, pcre_cache_entry *pce return NULL; } - subpat_names = pecalloc(num_subpats, sizeof(zend_string *), true); + subpat_names = ecalloc(num_subpats, sizeof(zend_string *)); while (ni++ < name_cnt) { unsigned short name_idx = 0x100 * (unsigned char)name_table[0] + (unsigned char)name_table[1]; const char *name = name_table + 2; - /* Note: this makes a persistent string when the cache is not request-based because the string - * has to outlive the request. In that case, they will only be used within this thread - * and never be shared. - * Although we will be storing them in user-exposed arrays, they cannot cause problems - * because they only live in this thread and the last reference is deleted on shutdown - * instead of by user code. */ - subpat_names[name_idx] = zend_string_init(name, strlen(name), true); - GC_MAKE_PERSISTENT_LOCAL(subpat_names[name_idx]); + subpat_names[name_idx] = zend_string_init(name, strlen(name), false); name_table += name_size; } return subpat_names; } /* }}} */ +static zend_string **ensure_subpats_table(uint32_t name_cnt, pcre_cache_entry *pce) +{ + if (!pce->subpats_table) { + pce->subpats_table = make_subpats_table(name_cnt, pce); + } + return pce->subpats_table; +} + /* {{{ static calculate_unit_length */ /* Calculates the byte length of the next character. Assumes valid UTF-8 for PCRE2_UTF. */ static zend_always_inline size_t calculate_unit_length(pcre_cache_entry *pce, const char *start) @@ -831,6 +833,7 @@ PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, bo new_entry.preg_options = poptions; new_entry.compile_options = coptions; new_entry.refcount = 0; + new_entry.subpats_table = NULL; rc = pcre2_pattern_info(re, PCRE2_INFO_CAPTURECOUNT, &new_entry.capture_count); if (rc < 0) { @@ -842,8 +845,7 @@ PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, bo return NULL; } - uint32_t name_count; - rc = pcre2_pattern_info(re, PCRE2_INFO_NAMECOUNT, &name_count); + rc = pcre2_pattern_info(re, PCRE2_INFO_NAMECOUNT, &new_entry.name_count); if (rc < 0) { if (key != regex) { zend_string_release_ex(key, 0); @@ -853,21 +855,6 @@ PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, bo return NULL; } - /* Compute and cache the subpattern table to avoid computing it again over and over. */ - if (name_count > 0) { - new_entry.subpats_table = make_subpats_table(name_count, &new_entry); - if (!new_entry.subpats_table) { - if (key != regex) { - zend_string_release_ex(key, false); - } - /* Warning already emitted by make_subpats_table() */ - pcre_handle_exec_error(PCRE2_ERROR_INTERNAL); - return NULL; - } - } else { - new_entry.subpats_table = NULL; - } - /* * Interned strings are not duplicated when stored in HashTable, * but all the interned strings created during HTTP request are removed @@ -982,7 +969,7 @@ static zend_always_inline void populate_match_value( static inline void add_named( HashTable *const subpats, zend_string *name, zval *val, bool unmatched) { - ZEND_ASSERT(GC_FLAGS(name) & IS_STR_PERSISTENT); + ZEND_ASSERT(!(GC_FLAGS(name) & IS_STR_PERSISTENT)); /* If the DUPNAMES option is used, multiple subpatterns might have the same name. * In this case we want to preserve the one that actually has a value. */ @@ -1245,8 +1232,11 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, * allocate the table only if there are any named subpatterns. */ subpat_names = NULL; - if (subpats) { - subpat_names = pce->subpats_table; + if (subpats && pce->name_count > 0) { + subpat_names = ensure_subpats_table(pce->name_count, pce); + if (UNEXPECTED(!subpat_names)) { + RETURN_FALSE; + } } matched = 0; @@ -1882,7 +1872,14 @@ static zend_string *php_pcre_replace_func_impl(pcre_cache_entry *pce, zend_strin /* Calculate the size of the offsets array, and allocate memory for it. */ num_subpats = pce->capture_count + 1; - subpat_names = pce->subpats_table; + if (pce->name_count > 0) { + subpat_names = ensure_subpats_table(pce->name_count, pce); + if (UNEXPECTED(!subpat_names)) { + return NULL; + } + } else { + subpat_names = NULL; + } alloc_len = 0; result = NULL;