diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
index 1ca160d4740a6..a976d27913de8 100644
--- a/ext/mbstring/mbstring.c
+++ b/ext/mbstring/mbstring.c
@@ -3166,6 +3166,119 @@ PHP_FUNCTION(mb_rtrim)
 	php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_RTRIM);
 }
 
+/* Computes the Levenshtein (edit) distance between $string1 and $string2, comparing
+ * characters decoded with the selected encoding's converter rather than raw bytes.
+ *
+ * Arguments: string1, string2, then optional insertion, replacement and deletion costs
+ * (each defaulting to 1) and an optional encoding name (argument 6, may be null).
+ * Returns the weighted distance as an integer. If php_mb_get_encoding() yields no
+ * encoding for argument 6, the function leaves through RETURN_THROWS(). */
+PHP_FUNCTION(mb_levenshtein)
+{
+	zend_string *string1, *string2, *enc_name = NULL;
+	zend_long cost_ins = 1;
+	zend_long cost_rep = 1;
+	zend_long cost_del = 1;
+
+	ZEND_PARSE_PARAMETERS_START(2, 6)
+		Z_PARAM_STR(string1)
+		Z_PARAM_STR(string2)
+		Z_PARAM_OPTIONAL
+		Z_PARAM_LONG(cost_ins)
+		Z_PARAM_LONG(cost_rep)
+		Z_PARAM_LONG(cost_del)
+		Z_PARAM_STR_OR_NULL(enc_name)
+	ZEND_PARSE_PARAMETERS_END();
+
+	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 6);
+	if (!enc) {
+		RETURN_THROWS();
+	}
+
+	/* When all costs are equal, levenshtein fulfills the requirements of a metric, which means
+	 * that the distance is symmetric. If string1 is shorter than string2 (measured in bytes via
+	 * ZSTR_LEN) we can save memory (and CPU time) by having shorter rows (p1 & p2). */
+	if (ZSTR_LEN(string1) < ZSTR_LEN(string2) && cost_ins == cost_rep && cost_rep == cost_del) {
+		zend_string *tmp = string1;
+		string1 = string2;
+		string2 = tmp;
+	}
+
+	size_t strlen_1 = mb_get_strlen(string1, enc);
+	size_t strlen_2 = mb_get_strlen(string2, enc);
+
+	/* An empty string1 is turned into string2 purely by insertions. */
+	if (strlen_1 == 0) {
+		RETURN_LONG(strlen_2 * cost_ins);
+	}
+
+	/* An empty string2 is reached purely by deleting every character of string1, so the
+	 * deletion cost applies here, not the insertion cost. */
+	if (strlen_2 == 0) {
+		RETURN_LONG(strlen_1 * cost_del);
+	}
+
+	unsigned char *in_1 = (unsigned char*)ZSTR_VAL(string1);
+	size_t in_len_1 = ZSTR_LEN(string1);
+	/* The two inputs are decoded in an interleaved fashion, so each one needs its own converter
+	 * state; a shared state would carry one string's decoder state into the other. */
+	unsigned int state_1 = 0;
+	/* Zero-initialized so that a converter call producing no output never leads to a
+	 * comparison of indeterminate values. */
+	uint32_t wchar_buf_1[1] = {0}, wchar_buf_2[1] = {0};
+	zend_long tmp_wchar_len;
+
+	/* p1 holds the previous row of the distance matrix, p2 the row currently being filled. */
+	zend_long *p1 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0);
+	zend_long *p2 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0);
+
+	for (size_t i2 = 0; i2 <= strlen_2; i2++) {
+		p1[i2] = i2 * cost_ins;
+	}
+
+	for (size_t i1 = 0; i1 < strlen_1; i1++) {
+		/* NOTE(review): only one wchar is requested per call; confirm that every encoding's
+		 * to_wchar handler supports an output buffer this small and that the number of wchars
+		 * it yields always matches mb_get_strlen(). */
+		tmp_wchar_len = enc->to_wchar(&in_1, &in_len_1, wchar_buf_1, 1, &state_1);
+		ZEND_ASSERT(tmp_wchar_len <= 1);
+
+		/* Every row walks string2 from its first byte again, with a fresh converter state. */
+		unsigned char *in_2 = (unsigned char*)ZSTR_VAL(string2);
+		size_t in_len_2 = ZSTR_LEN(string2);
+		unsigned int state_2 = 0;
+
+		p2[0] = p1[0] + cost_del;
+		for (size_t i2 = 0; i2 < strlen_2; i2++) {
+			tmp_wchar_len = enc->to_wchar(&in_2, &in_len_2, wchar_buf_2, 1, &state_2);
+			ZEND_ASSERT(tmp_wchar_len <= 1);
+
+			/* Cheapest of: replace (or keep) the character, delete it, insert a new one. */
+			zend_long c0 = p1[i2] + (wchar_buf_1[0] == wchar_buf_2[0] ? 0 : cost_rep);
+			zend_long c1 = p1[i2 + 1] + cost_del;
+			if (c1 < c0) {
+				c0 = c1;
+			}
+			zend_long c2 = p2[i2] + cost_ins;
+			if (c2 < c0) {
+				c0 = c2;
+			}
+			p2[i2 + 1] = c0;
+		}
+
+		/* The row just computed becomes the previous row of the next iteration. */
+		zend_long *tmp = p1;
+		p1 = p2;
+		p2 = tmp;
+	}
+
+	zend_long distance = p1[strlen_2];
+	efree(p1);
+	efree(p2);
+
+	RETURN_LONG(distance);
+}
+
 static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
 {
 	const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0);
diff --git a/ext/mbstring/mbstring.stub.php b/ext/mbstring/mbstring.stub.php
index af9c5cbb93ea2..5fe3daac53297 100644
--- a/ext/mbstring/mbstring.stub.php
+++ b/ext/mbstring/mbstring.stub.php
@@ -145,6 +145,8 @@ function mb_ltrim(string $string, ?string $characters = null, ?string $encoding
 
 function mb_rtrim(string $string, ?string $characters = null, ?string $encoding = null): string {}
 
+function mb_levenshtein(string $string1, string $string2, int $insertion_cost = 1, int $replacement_cost = 1, int $deletion_cost = 1, ?string $encoding = null): int {}
+
 /** @refcount 1 */
 function mb_detect_encoding(string $string, array|string|null $encodings = null, bool $strict = false): string|false {}
 
diff --git a/ext/mbstring/mbstring_arginfo.h b/ext/mbstring/mbstring_arginfo.h
index 230dddf96941c..681bf4711500c 100644
--- a/ext/mbstring/mbstring_arginfo.h
+++ b/ext/mbstring/mbstring_arginfo.h
@@ -1,5 +1,5 @@
 /* This is a generated file, edit the .stub.php file instead.
- * Stub hash: 03c07f68bea7d7b96e6dc11f180f45663b859ed3 */ + * Stub hash: de316945d5da7430183851eb3ac7dce2fc15b339 */ ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_mb_language, 0, 0, MAY_BE_STRING|MAY_BE_BOOL) ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, language, IS_STRING, 1, "null") @@ -132,6 +132,15 @@ ZEND_END_ARG_INFO() #define arginfo_mb_rtrim arginfo_mb_trim +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_mb_levenshtein, 0, 2, IS_LONG, 0) + ZEND_ARG_TYPE_INFO(0, string1, IS_STRING, 0) + ZEND_ARG_TYPE_INFO(0, string2, IS_STRING, 0) + ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, insertion_cost, IS_LONG, 0, "1") + ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, replacement_cost, IS_LONG, 0, "1") + ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, deletion_cost, IS_LONG, 0, "1") + ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, encoding, IS_STRING, 1, "null") +ZEND_END_ARG_INFO() + ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_mb_detect_encoding, 0, 1, MAY_BE_STRING|MAY_BE_FALSE) ZEND_ARG_TYPE_INFO(0, string, IS_STRING, 0) ZEND_ARG_TYPE_MASK(0, encodings, MAY_BE_ARRAY|MAY_BE_STRING|MAY_BE_NULL, "null") @@ -327,6 +336,7 @@ ZEND_FUNCTION(mb_lcfirst); ZEND_FUNCTION(mb_trim); ZEND_FUNCTION(mb_ltrim); ZEND_FUNCTION(mb_rtrim); +ZEND_FUNCTION(mb_levenshtein); ZEND_FUNCTION(mb_detect_encoding); ZEND_FUNCTION(mb_list_encodings); ZEND_FUNCTION(mb_encoding_aliases); @@ -396,6 +406,7 @@ static const zend_function_entry ext_functions[] = { ZEND_FE(mb_trim, arginfo_mb_trim) ZEND_FE(mb_ltrim, arginfo_mb_ltrim) ZEND_FE(mb_rtrim, arginfo_mb_rtrim) + ZEND_FE(mb_levenshtein, arginfo_mb_levenshtein) ZEND_FE(mb_detect_encoding, arginfo_mb_detect_encoding) ZEND_FE(mb_list_encodings, arginfo_mb_list_encodings) ZEND_FE(mb_encoding_aliases, arginfo_mb_encoding_aliases) diff --git a/ext/mbstring/tests/mb_levenshtein.phpt b/ext/mbstring/tests/mb_levenshtein.phpt new file mode 100644 index 0000000000000..7c817b889eb36 --- /dev/null +++ b/ext/mbstring/tests/mb_levenshtein.phpt @@ -0,0 +1,139 @@ +--TEST-- 
+mb_levenshtein() function test +--EXTENSIONS-- +mbstring +--FILE-- + +--EXPECT-- +--- Equal --- +int(0) +--- First string empty --- +int(3) +--- Second string empty --- +int(3) +--- Both empty --- +int(0) +int(0) +--- 1 character --- +int(1) +--- 2 character swapped --- +int(2) +--- Inexpensive deletion --- +int(2) +--- Expensive deletion --- +int(10) +--- Inexpensive insertion --- +int(2) +--- Expensive insertion --- +int(10) +--- Expensive replacement --- +int(3) +--- Very expensive replacement --- +int(4) +--- 128 codepoints --- +int(2) +--- 128 codepoints over --- +int(2) +int(256) +--- 128 codepoints over only $string1 --- +int(128) +--- 128 codepoints over only $string2 --- +int(130) +--- 128 codepoints over Hiragana --- +int(2) +--- 128 codepoints over Hiragana in Shift_JIS --- +int(2) +--- café in ISO-8859-1 --- +int(1) +--- Variable selector --- +int(1) +int(1) +--- Usecase of userland code --- +OK +--- Usecase of Emoji --- +int(1) +int(3) diff --git a/ext/mbstring/tests/mb_levenshtein_userland.inc b/ext/mbstring/tests/mb_levenshtein_userland.inc new file mode 100644 index 0000000000000..a0cadce86cc43 --- /dev/null +++ b/ext/mbstring/tests/mb_levenshtein_userland.inc @@ -0,0 +1,69 @@ +