From af72b0bff6ad2f60325fa27ff9e4ae86d565e1e7 Mon Sep 17 00:00:00 2001 From: Yuya Hamada Date: Tue, 24 Sep 2024 17:07:12 +0900 Subject: [PATCH 01/16] [Draft][Require RFC] mb_levenshtein function --- ext/mbstring/mbstring.c | 120 +++++++++++++++++++++++++ ext/mbstring/mbstring.stub.php | 2 + ext/mbstring/mbstring_arginfo.h | 13 ++- ext/mbstring/tests/mb_levenshtein.phpt | 86 ++++++++++++++++++ 4 files changed, 220 insertions(+), 1 deletion(-) create mode 100644 ext/mbstring/tests/mb_levenshtein.phpt diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 1ca160d4740a6..5a2313dc5cd96 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -3166,6 +3166,126 @@ PHP_FUNCTION(mb_rtrim) php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_RTRIM); } +PHP_FUNCTION(mb_levenshtein) +{ + zend_string *string1, *string2, *enc_name = NULL; + zend_long cost_ins = 1; + zend_long cost_rep = 1; + zend_long cost_del = 1; + + ZEND_PARSE_PARAMETERS_START(2, 6) + Z_PARAM_STR(string1) + Z_PARAM_STR(string2) + Z_PARAM_OPTIONAL + Z_PARAM_LONG(cost_ins) + Z_PARAM_LONG(cost_rep) + Z_PARAM_LONG(cost_del) + Z_PARAM_STR_OR_NULL(enc_name) + ZEND_PARSE_PARAMETERS_END(); + + if (ZSTR_LEN(string1) == 0) { + RETVAL_LONG(ZSTR_LEN(string2) * cost_ins); + } + + if (ZSTR_LEN(string2) == 0) { + RETVAL_LONG(ZSTR_LEN(string1) * cost_del); + } + + const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 6); + if (!enc) { + RETURN_THROWS(); + } + + /* When all costs are equal, levenshtein fulfills the requirements of a metric, which means + * that the distance is symmetric. If string1 is shorter than string 2 we can save memory (and CPU time) + * by having shorter rows (p1 & p2). 
*/ + if (ZSTR_LEN(string1) < ZSTR_LEN(string2) && cost_ins == cost_rep && cost_rep == cost_del) { + zend_string *tmp = string1; + string1 = string2; + string2 = tmp; + } + + uint32_t wchar_buf_1[128], wchar_buf_2[128]; + size_t i1, i2; + zend_long *p1, *p2, *tmp; + size_t strlen_1 = mb_get_strlen(string1, enc); + size_t strlen_2 = mb_get_strlen(string2, enc); + size_t len_1 = 0; + size_t len_2 = 0; + size_t in_len_1 = ZSTR_LEN(string1); + size_t in_len_2 = ZSTR_LEN(string2); + unsigned char *in_1 = (unsigned char*)ZSTR_VAL(string1); + unsigned char *in_2 = (unsigned char*)ZSTR_VAL(string2); + unsigned int state = 0; + + zend_long c0, c1, c2; + + p1 = safe_emalloc(strlen_1, sizeof(zend_long), 0); + p2 = safe_emalloc(strlen_2, sizeof(zend_long), 0); + + for (i2 = 0; i2 <= strlen_2; i2++) { + p1[i2] = i2 * cost_ins; + } + + zend_long tmp_wchar_len_1 = 0; + zend_long tmp_wchar_len_2 = 0; + bool first = true; + + while (in_len_1) { + tmp_wchar_len_1 = enc->to_wchar(&in_1, &in_len_1, wchar_buf_1, 128, &state); + len_1 += tmp_wchar_len_1; + ZEND_ASSERT(in_len_1 <= 128); + tmp_wchar_len_2 = enc->to_wchar(&in_2, &in_len_2, wchar_buf_2, 128, &state); + len_2 += tmp_wchar_len_2; + ZEND_ASSERT(in_len_2 <= 128); + + for (i1 = 0; i1 < tmp_wchar_len_1; i1++) { + /* First loop that does not cross a 128 code points */ + if (first) { + p2[0] = p1[0] + cost_del; + } + /* Insertion process when there is a surplus of 128 code points. */ + if (tmp_wchar_len_2 == 0) { + for (i2 = 0; i2 < tmp_wchar_len_1; i2++) { + c0 = p1[i2 + (len_2 - tmp_wchar_len_1)] + cost_rep; + c1 = p1[i2 + (len_2 - tmp_wchar_len_1) + 1] + cost_del; + if (c1 < c0) { + c0 = c1; + } + c2 = p2[i2 + (len_2 - tmp_wchar_len_1)] + cost_ins; + if (c2 < c0) { + c0 = c2; + } + p2[i2 + (len_2 - tmp_wchar_len_1) + 1] = c0; + } + } else { + for (i2 = 0; i2 < tmp_wchar_len_2; i2++) { + c0 = p1[i2 + (len_2 - tmp_wchar_len_2)] + (wchar_buf_1[i1] == wchar_buf_2[i2] ? 
0 : cost_rep); + c1 = p1[i2 + (len_2 - tmp_wchar_len_2) + 1] + cost_del; + if (c1 < c0) { + c0 = c1; + } + c2 = p2[i2 + (len_2 - tmp_wchar_len_2)] + cost_ins; + if (c2 < c0) { + c0 = c2; + } + p2[i2 + (len_2 - tmp_wchar_len_2) + 1] = c0; + } + } + tmp = p1; + p1 = p2; + p2 = tmp; + } + first = false; + } + + c0 = p1[strlen_2]; + efree(p1); + efree(p2); + + RETVAL_LONG(c0); +} + static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size) { const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0); diff --git a/ext/mbstring/mbstring.stub.php b/ext/mbstring/mbstring.stub.php index af9c5cbb93ea2..5fe3daac53297 100644 --- a/ext/mbstring/mbstring.stub.php +++ b/ext/mbstring/mbstring.stub.php @@ -145,6 +145,8 @@ function mb_ltrim(string $string, ?string $characters = null, ?string $encoding function mb_rtrim(string $string, ?string $characters = null, ?string $encoding = null): string {} +function mb_levenshtein(string $string1, string $string2, int $insertion_cost = 1, int $replacement_cost = 1, int $deletion_cost = 1, ?string $encoding = null): int {} + /** @refcount 1 */ function mb_detect_encoding(string $string, array|string|null $encodings = null, bool $strict = false): string|false {} diff --git a/ext/mbstring/mbstring_arginfo.h b/ext/mbstring/mbstring_arginfo.h index 230dddf96941c..681bf4711500c 100644 --- a/ext/mbstring/mbstring_arginfo.h +++ b/ext/mbstring/mbstring_arginfo.h @@ -1,5 +1,5 @@ /* This is a generated file, edit the .stub.php file instead. 
- * Stub hash: 03c07f68bea7d7b96e6dc11f180f45663b859ed3 */ + * Stub hash: de316945d5da7430183851eb3ac7dce2fc15b339 */ ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_mb_language, 0, 0, MAY_BE_STRING|MAY_BE_BOOL) ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, language, IS_STRING, 1, "null") @@ -132,6 +132,15 @@ ZEND_END_ARG_INFO() #define arginfo_mb_rtrim arginfo_mb_trim +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_mb_levenshtein, 0, 2, IS_LONG, 0) + ZEND_ARG_TYPE_INFO(0, string1, IS_STRING, 0) + ZEND_ARG_TYPE_INFO(0, string2, IS_STRING, 0) + ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, insertion_cost, IS_LONG, 0, "1") + ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, replacement_cost, IS_LONG, 0, "1") + ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, deletion_cost, IS_LONG, 0, "1") + ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, encoding, IS_STRING, 1, "null") +ZEND_END_ARG_INFO() + ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_mb_detect_encoding, 0, 1, MAY_BE_STRING|MAY_BE_FALSE) ZEND_ARG_TYPE_INFO(0, string, IS_STRING, 0) ZEND_ARG_TYPE_MASK(0, encodings, MAY_BE_ARRAY|MAY_BE_STRING|MAY_BE_NULL, "null") @@ -327,6 +336,7 @@ ZEND_FUNCTION(mb_lcfirst); ZEND_FUNCTION(mb_trim); ZEND_FUNCTION(mb_ltrim); ZEND_FUNCTION(mb_rtrim); +ZEND_FUNCTION(mb_levenshtein); ZEND_FUNCTION(mb_detect_encoding); ZEND_FUNCTION(mb_list_encodings); ZEND_FUNCTION(mb_encoding_aliases); @@ -396,6 +406,7 @@ static const zend_function_entry ext_functions[] = { ZEND_FE(mb_trim, arginfo_mb_trim) ZEND_FE(mb_ltrim, arginfo_mb_ltrim) ZEND_FE(mb_rtrim, arginfo_mb_rtrim) + ZEND_FE(mb_levenshtein, arginfo_mb_levenshtein) ZEND_FE(mb_detect_encoding, arginfo_mb_detect_encoding) ZEND_FE(mb_list_encodings, arginfo_mb_list_encodings) ZEND_FE(mb_encoding_aliases, arginfo_mb_encoding_aliases) diff --git a/ext/mbstring/tests/mb_levenshtein.phpt b/ext/mbstring/tests/mb_levenshtein.phpt new file mode 100644 index 0000000000000..23c103334c0db --- /dev/null +++ b/ext/mbstring/tests/mb_levenshtein.phpt @@ -0,0 +1,86 @@ +--TEST-- 
+mb_levenshtein() function test +--FILE-- + +--EXPECT-- +--- Equal --- +int(0) +--- First string empty --- +int(3) +--- Second string empty --- +int(3) +--- Both empty --- +int(0) +int(0) +--- 1 character --- +int(1) +--- 2 character swapped --- +int(2) +--- Inexpensive deletion --- +int(2) +--- Expensive deletion --- +int(10) +--- Inexpensive insertion --- +int(2) +--- Expensive insertion --- +int(10) +--- Expensive replacement --- +int(3) +--- Very expensive replacement --- +int(4) +--- 128 codepoints over --- +int(2) +--- 128 codepoints over only $string1 --- +int(128) +--- 128 codepoints over only $string2 --- +int(130) +--- 128 codepoints over Hiragana --- +int(2) +--- 128 codepoints over Hiragana in Shift_JIS --- +int(2) From aa2b209d07c3f035bbbb0fb7edfd286e2eaa0302 Mon Sep 17 00:00:00 2001 From: Yuya Hamada Date: Wed, 25 Sep 2024 16:30:31 +0900 Subject: [PATCH 02/16] Delete unused variable --- ext/mbstring/mbstring.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 5a2313dc5cd96..d037fdcb4e57b 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -3210,7 +3210,6 @@ PHP_FUNCTION(mb_levenshtein) zend_long *p1, *p2, *tmp; size_t strlen_1 = mb_get_strlen(string1, enc); size_t strlen_2 = mb_get_strlen(string2, enc); - size_t len_1 = 0; size_t len_2 = 0; size_t in_len_1 = ZSTR_LEN(string1); size_t in_len_2 = ZSTR_LEN(string2); @@ -3233,7 +3232,6 @@ PHP_FUNCTION(mb_levenshtein) while (in_len_1) { tmp_wchar_len_1 = enc->to_wchar(&in_1, &in_len_1, wchar_buf_1, 128, &state); - len_1 += tmp_wchar_len_1; ZEND_ASSERT(in_len_1 <= 128); tmp_wchar_len_2 = enc->to_wchar(&in_2, &in_len_2, wchar_buf_2, 128, &state); len_2 += tmp_wchar_len_2; From 4d8aa99a61b1950af41b2ed6d1135123e994abfc Mon Sep 17 00:00:00 2001 From: Yuya Hamada Date: Wed, 25 Sep 2024 22:21:12 +0900 Subject: [PATCH 03/16] Fix out of memory --- ext/mbstring/mbstring.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 
9 deletions(-) diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index d037fdcb4e57b..bce7331e078ab 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -3219,8 +3219,8 @@ PHP_FUNCTION(mb_levenshtein) zend_long c0, c1, c2; - p1 = safe_emalloc(strlen_1, sizeof(zend_long), 0); - p2 = safe_emalloc(strlen_2, sizeof(zend_long), 0); + p1 = safe_emalloc(strlen_1 + 1, sizeof(zend_long), 0); + p2 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0); for (i2 = 0; i2 <= strlen_2; i2++) { p1[i2] = i2 * cost_ins; @@ -3232,29 +3232,43 @@ PHP_FUNCTION(mb_levenshtein) while (in_len_1) { tmp_wchar_len_1 = enc->to_wchar(&in_1, &in_len_1, wchar_buf_1, 128, &state); - ZEND_ASSERT(in_len_1 <= 128); + ZEND_ASSERT(tmp_wchar_len_1 <= 128); tmp_wchar_len_2 = enc->to_wchar(&in_2, &in_len_2, wchar_buf_2, 128, &state); len_2 += tmp_wchar_len_2; - ZEND_ASSERT(in_len_2 <= 128); + ZEND_ASSERT(tmp_wchar_len_2 <= 128); for (i1 = 0; i1 < tmp_wchar_len_1; i1++) { /* First loop that does not cross a 128 code points */ if (first) { p2[0] = p1[0] + cost_del; } - /* Insertion process when there is a surplus of 128 code points. */ if (tmp_wchar_len_2 == 0) { + /* Insertion process when there is a surplus of 128 code points. 
*/ for (i2 = 0; i2 < tmp_wchar_len_1; i2++) { - c0 = p1[i2 + (len_2 - tmp_wchar_len_1)] + cost_rep; - c1 = p1[i2 + (len_2 - tmp_wchar_len_1) + 1] + cost_del; + /* for overflow */ + if (len_2 < tmp_wchar_len_1) { + c0 = p1[i2] + cost_rep; + c1 = p1[i2 + 1] + cost_del; + } else { + c0 = p1[i2 + (len_2 - tmp_wchar_len_1)] + cost_rep; + c1 = p1[i2 + (len_2 - tmp_wchar_len_1) + 1] + cost_del; + } if (c1 < c0) { c0 = c1; } - c2 = p2[i2 + (len_2 - tmp_wchar_len_1)] + cost_ins; + if (len_2 < tmp_wchar_len_1) { + c2 = p2[i2] + cost_ins; + } else { + c2 = p2[i2] + cost_ins; + } if (c2 < c0) { c0 = c2; } - p2[i2 + (len_2 - tmp_wchar_len_1) + 1] = c0; + if (len_2 < tmp_wchar_len_1) { + p2[i2 + 1] = c0; + } else { + p2[i2 + (len_2 - tmp_wchar_len_1) + 1] = c0; + } } } else { for (i2 = 0; i2 < tmp_wchar_len_2; i2++) { From 4b4f8a0761b8483a43b6c3c46c64af9183815c25 Mon Sep 17 00:00:00 2001 From: Yuya Hamada Date: Wed, 25 Sep 2024 23:50:36 +0900 Subject: [PATCH 04/16] Fix overflow (maybe) --- ext/mbstring/mbstring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index bce7331e078ab..96d8f87972ce9 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -3244,7 +3244,7 @@ PHP_FUNCTION(mb_levenshtein) } if (tmp_wchar_len_2 == 0) { /* Insertion process when there is a surplus of 128 code points. 
*/ - for (i2 = 0; i2 < tmp_wchar_len_1; i2++) { + for (i2 = 0; i2 < tmp_wchar_len_1 && len_2 != 0; i2++) { /* for overflow */ if (len_2 < tmp_wchar_len_1) { c0 = p1[i2] + cost_rep; From 952af9196166ed41177b5858005e09ab1ff188ec Mon Sep 17 00:00:00 2001 From: tekimen Date: Wed, 25 Sep 2024 09:19:31 -0700 Subject: [PATCH 05/16] Update ext/mbstring/mbstring.c Co-authored-by: Niels Dossche <7771979+nielsdos@users.noreply.github.com> --- ext/mbstring/mbstring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 96d8f87972ce9..ac92c3965d4f6 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -3295,7 +3295,7 @@ PHP_FUNCTION(mb_levenshtein) efree(p1); efree(p2); - RETVAL_LONG(c0); + RETURN_LONG(c0); } static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size) From f764cbfc4086c6f3b64a4cc49c0e9671d843455e Mon Sep 17 00:00:00 2001 From: Yuya Hamada Date: Thu, 26 Sep 2024 02:23:29 +0900 Subject: [PATCH 06/16] Fix zero byte to zero codepoint --- ext/mbstring/mbstring.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index ac92c3965d4f6..296e177e9bd00 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -3183,14 +3183,6 @@ PHP_FUNCTION(mb_levenshtein) Z_PARAM_STR_OR_NULL(enc_name) ZEND_PARSE_PARAMETERS_END(); - if (ZSTR_LEN(string1) == 0) { - RETVAL_LONG(ZSTR_LEN(string2) * cost_ins); - } - - if (ZSTR_LEN(string2) == 0) { - RETVAL_LONG(ZSTR_LEN(string1) * cost_del); - } - const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 6); if (!enc) { RETURN_THROWS(); @@ -3217,6 +3209,14 @@ PHP_FUNCTION(mb_levenshtein) unsigned char *in_2 = (unsigned char*)ZSTR_VAL(string2); unsigned int state = 0; + if (strlen_1 == 0) { + RETURN_LONG(strlen_2 * cost_ins); + } + + if (strlen_2 == 0) { + RETURN_LONG(strlen_1 * cost_ins); + } + zend_long c0, c1, c2; p1 = 
safe_emalloc(strlen_1 + 1, sizeof(zend_long), 0); From 8108bc23c417bd6ebbbb341202d94fb263443563 Mon Sep 17 00:00:00 2001 From: Yuya Hamada Date: Thu, 26 Sep 2024 16:25:35 +0900 Subject: [PATCH 07/16] Fix asan memory error --- ext/mbstring/mbstring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 296e177e9bd00..6b845d9baf1ea 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -3219,7 +3219,7 @@ PHP_FUNCTION(mb_levenshtein) zend_long c0, c1, c2; - p1 = safe_emalloc(strlen_1 + 1, sizeof(zend_long), 0); + p1 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0); p2 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0); for (i2 = 0; i2 <= strlen_2; i2++) { From 3bedd8704ac197ed5921fc6864bd5d8ca2649b76 Mon Sep 17 00:00:00 2001 From: Yuya Hamada Date: Fri, 27 Sep 2024 00:52:05 +0900 Subject: [PATCH 08/16] Add test code and remove unnecessary code --- ext/mbstring/mbstring.c | 24 +++---------- ext/mbstring/tests/mb_levenshtein.phpt | 48 ++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 19 deletions(-) diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 6b845d9baf1ea..0b37a4e0b2dcc 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -3244,31 +3244,17 @@ PHP_FUNCTION(mb_levenshtein) } if (tmp_wchar_len_2 == 0) { /* Insertion process when there is a surplus of 128 code points. 
*/ - for (i2 = 0; i2 < tmp_wchar_len_1 && len_2 != 0; i2++) { - /* for overflow */ - if (len_2 < tmp_wchar_len_1) { - c0 = p1[i2] + cost_rep; - c1 = p1[i2 + 1] + cost_del; - } else { - c0 = p1[i2 + (len_2 - tmp_wchar_len_1)] + cost_rep; - c1 = p1[i2 + (len_2 - tmp_wchar_len_1) + 1] + cost_del; - } + for (i2 = 0; i2 < tmp_wchar_len_1; i2++) { + c0 = p1[i2 + (len_2 - tmp_wchar_len_1)] + cost_rep; + c1 = p1[i2 + (len_2 - tmp_wchar_len_1) + 1] + cost_del; if (c1 < c0) { c0 = c1; } - if (len_2 < tmp_wchar_len_1) { - c2 = p2[i2] + cost_ins; - } else { - c2 = p2[i2] + cost_ins; - } + c2 = p2[i2] + cost_ins; if (c2 < c0) { c0 = c2; } - if (len_2 < tmp_wchar_len_1) { - p2[i2 + 1] = c0; - } else { - p2[i2 + (len_2 - tmp_wchar_len_1) + 1] = c0; - } + p2[i2 + (len_2 - tmp_wchar_len_1) + 1] = c0; } } else { for (i2 = 0; i2 < tmp_wchar_len_2; i2++) { diff --git a/ext/mbstring/tests/mb_levenshtein.phpt b/ext/mbstring/tests/mb_levenshtein.phpt index 23c103334c0db..378ff6a16d89c 100644 --- a/ext/mbstring/tests/mb_levenshtein.phpt +++ b/ext/mbstring/tests/mb_levenshtein.phpt @@ -47,6 +47,45 @@ echo '--- 128 codepoints over Hiragana in Shift_JIS ---' . \PHP_EOL; $hiragana_a = mb_convert_encoding("あ", "SJIS", "UTF-8"); $hiragana_aiu = mb_convert_encoding("あいう", "SJIS", "UTF-8"); var_dump(mb_levenshtein(str_repeat($hiragana_a, 128 + 3), str_repeat($hiragana_a, 128) . $hiragana_aiu, encoding: "SJIS")); + +echo '--- Usecase of userland code ---' . 
\PHP_EOL; +/* from: https://qiita.com/mpyw/items/2b636827730e06c71e3d */ +$query = 'ほあようごぁいまーしゅ'; +$comps = [ + 'こんにちはー', + 'おはようございまーす', + 'こんばんはー', + 'おやすみなさーい', + 'いただきまーす', + 'おつかれさまー', + 'ぬぁあああんつかれたもぉぉぉぉぉぉん', +]; +$min = 99999; +$min_key = 0; +foreach ($comps as $key => $comp) { + $sim = mb_levenshtein($query, $comp); + if ($min >= $sim) { + $min = $sim; + $min_key = $key; + } +} +var_dump($comps[$min_key]); + +$base = 'やんほぬ'; +$comps = [ + 'かんのみほ', + 'かんのみほう', + 'かんぺみろ', + 'ああいいふろ', + 'ちゃんとみろ', + 'ターミナルさん', +]; +foreach ($comps as $comp) { + var_dump(mb_levenshtein($base, $comp)); +} + +/* from: https://qiita.com/suin/items/a0a8227addad11ff2ea7 */ +var_dump(mb_levenshtein('あとうかい', 'かとうあい')); // int(2) ?> --EXPECT-- --- Equal --- @@ -84,3 +123,12 @@ int(130) int(2) --- 128 codepoints over Hiragana in Shift_JIS --- int(2) +--- Usecase of userland code --- +string(30) "おはようございまーす" +int(4) +int(4) +int(4) +int(6) +int(5) +int(7) +int(2) From 9233ecc58dabb5592bc1aa396a6cae98b2db6fdc Mon Sep 17 00:00:00 2001 From: Yuya Hamada Date: Fri, 27 Sep 2024 17:36:54 +0900 Subject: [PATCH 09/16] Add ISO-8859-1 pattern --- ext/mbstring/tests/mb_levenshtein.phpt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ext/mbstring/tests/mb_levenshtein.phpt b/ext/mbstring/tests/mb_levenshtein.phpt index 378ff6a16d89c..42000208178ab 100644 --- a/ext/mbstring/tests/mb_levenshtein.phpt +++ b/ext/mbstring/tests/mb_levenshtein.phpt @@ -34,6 +34,8 @@ var_dump(mb_levenshtein('111', '121', 2, 3, 2)); echo '--- Very expensive replacement ---' . \PHP_EOL; var_dump(mb_levenshtein('111', '121', 2, 9, 2)); +echo '--- 128 codepoints ---' . \PHP_EOL; +var_dump(mb_levenshtein(str_repeat("a", 128), str_repeat("a", 125) . "abc")); echo '--- 128 codepoints over ---' . \PHP_EOL; var_dump(mb_levenshtein(str_repeat("a", 128) . "abc", str_repeat("a", 128) . "aaa")); echo '--- 128 codepoints over only $string1 ---' . 
\PHP_EOL; @@ -48,6 +50,10 @@ $hiragana_a = mb_convert_encoding("あ", "SJIS", "UTF-8"); $hiragana_aiu = mb_convert_encoding("あいう", "SJIS", "UTF-8"); var_dump(mb_levenshtein(str_repeat($hiragana_a, 128 + 3), str_repeat($hiragana_a, 128) . $hiragana_aiu, encoding: "SJIS")); +echo '--- café in ISO-8859-1 ---' . \PHP_EOL; +$cafe = mb_convert_encoding("café", "ISO-8859-1", "UTF-8"); +var_dump(mb_levenshtein("cafe", $cafe, encoding: "ISO-8859-1")); + echo '--- Usecase of userland code ---' . \PHP_EOL; /* from: https://qiita.com/mpyw/items/2b636827730e06c71e3d */ $query = 'ほあようごぁいまーしゅ'; @@ -113,6 +119,8 @@ int(10) int(3) --- Very expensive replacement --- int(4) +--- 128 codepoints --- +int(2) --- 128 codepoints over --- int(2) --- 128 codepoints over only $string1 --- @@ -123,6 +131,8 @@ int(130) int(2) --- 128 codepoints over Hiragana in Shift_JIS --- int(2) +--- café in ISO-8859-1 --- +int(1) --- Usecase of userland code --- string(30) "おはようございまーす" int(4) From cf56777eb5c4deb5bc16d7a0872be3fa362c875d Mon Sep 17 00:00:00 2001 From: tekimen Date: Fri, 27 Sep 2024 21:45:03 +0900 Subject: [PATCH 10/16] Update ext/mbstring/tests/mb_levenshtein.phpt Co-authored-by: Christoph M. Becker --- ext/mbstring/tests/mb_levenshtein.phpt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ext/mbstring/tests/mb_levenshtein.phpt b/ext/mbstring/tests/mb_levenshtein.phpt index 42000208178ab..8ad73d2c60c94 100644 --- a/ext/mbstring/tests/mb_levenshtein.phpt +++ b/ext/mbstring/tests/mb_levenshtein.phpt @@ -1,5 +1,7 @@ --TEST-- mb_levenshtein() function test +--EXTENSIONS-- +mbstring --FILE-- Date: Mon, 30 Sep 2024 00:25:59 +0900 Subject: [PATCH 11/16] Update test code to use KEINOS/mb_levenshtein A userland function to compare multiple code points. We use that code for the test code. 
--- ext/mbstring/tests/mb_levenshtein.phpt | 53 ++--------- .../tests/mb_levenshtein_userland.inc | 88 +++++++++++++++++++ 2 files changed, 97 insertions(+), 44 deletions(-) create mode 100644 ext/mbstring/tests/mb_levenshtein_userland.inc diff --git a/ext/mbstring/tests/mb_levenshtein.phpt b/ext/mbstring/tests/mb_levenshtein.phpt index 8ad73d2c60c94..fab2f99e3107d 100644 --- a/ext/mbstring/tests/mb_levenshtein.phpt +++ b/ext/mbstring/tests/mb_levenshtein.phpt @@ -4,7 +4,7 @@ mb_levenshtein() function test mbstring --FILE-- $comp) { - $sim = mb_levenshtein($query, $comp); - if ($min >= $sim) { - $min = $sim; - $min_key = $key; - } -} -var_dump($comps[$min_key]); -$base = 'やんほぬ'; -$comps = [ - 'かんのみほ', - 'かんのみほう', - 'かんぺみろ', - 'ああいいふろ', - 'ちゃんとみろ', - 'ターミナルさん', -]; -foreach ($comps as $comp) { - var_dump(mb_levenshtein($base, $comp)); +for ($i = 0; $i < 100; $i++) { + $bytes = implode("", array_map(function ($byte) { return mb_chr(intval(bin2hex($byte), 16), "UTF-8"); }, str_split(random_bytes(10)))); + $compare = "あいうえおABCDEF"; + if (mb_levenshtein($bytes, $compare) !== keinos_mb_levenshtein($bytes, $compare)) { + throw Exception("mb_levenshtein compare error: {$bytes}"); + } } - -/* from: https://qiita.com/suin/items/a0a8227addad11ff2ea7 */ -var_dump(mb_levenshtein('あとうかい', 'かとうあい')); // int(2) +echo "OK" . 
PHP_EOL; ?> --EXPECT-- --- Equal --- @@ -136,11 +108,4 @@ int(2) --- café in ISO-8859-1 --- int(1) --- Usecase of userland code --- -string(30) "おはようございまーす" -int(4) -int(4) -int(4) -int(6) -int(5) -int(7) -int(2) +OK diff --git a/ext/mbstring/tests/mb_levenshtein_userland.inc b/ext/mbstring/tests/mb_levenshtein_userland.inc new file mode 100644 index 0000000000000..12236ae53bb5a --- /dev/null +++ b/ext/mbstring/tests/mb_levenshtein_userland.inc @@ -0,0 +1,88 @@ + Date: Tue, 1 Oct 2024 16:45:16 +0900 Subject: [PATCH 12/16] Improve creation of random strings --- ext/mbstring/tests/mb_levenshtein.phpt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ext/mbstring/tests/mb_levenshtein.phpt b/ext/mbstring/tests/mb_levenshtein.phpt index fab2f99e3107d..affc51a5fe3b3 100644 --- a/ext/mbstring/tests/mb_levenshtein.phpt +++ b/ext/mbstring/tests/mb_levenshtein.phpt @@ -59,7 +59,10 @@ var_dump(mb_levenshtein("cafe", $cafe, encoding: "ISO-8859-1")); echo '--- Usecase of userland code ---' . 
\PHP_EOL; for ($i = 0; $i < 100; $i++) { - $bytes = implode("", array_map(function ($byte) { return mb_chr(intval(bin2hex($byte), 16), "UTF-8"); }, str_split(random_bytes(10)))); + $bytes = ""; + for ($j = 0; $j < 10; $j++) { + $bytes .= mb_chr(mt_rand(0, 0x10FFF)); + } $compare = "あいうえおABCDEF"; if (mb_levenshtein($bytes, $compare) !== keinos_mb_levenshtein($bytes, $compare)) { throw Exception("mb_levenshtein compare error: {$bytes}"); From 4f255f70c128082e32b70ee29fc03dce7a12943d Mon Sep 17 00:00:00 2001 From: Yuya Hamada Date: Fri, 4 Oct 2024 00:58:17 +0900 Subject: [PATCH 13/16] Fix mb_levenshtein to per codepoint Add watchstate's test code --- ext/mbstring/mbstring.c | 72 ++++--------- ext/mbstring/tests/mb_levenshtein.phpt | 12 ++- .../tests/mb_levenshtein_userland.inc | 101 +++++++----------- 3 files changed, 72 insertions(+), 113 deletions(-) diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 0b37a4e0b2dcc..a976d27913de8 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -3197,12 +3197,11 @@ PHP_FUNCTION(mb_levenshtein) string2 = tmp; } - uint32_t wchar_buf_1[128], wchar_buf_2[128]; + uint32_t wchar_buf_1[1], wchar_buf_2[1]; size_t i1, i2; zend_long *p1, *p2, *tmp; size_t strlen_1 = mb_get_strlen(string1, enc); size_t strlen_2 = mb_get_strlen(string2, enc); - size_t len_2 = 0; size_t in_len_1 = ZSTR_LEN(string1); size_t in_len_2 = ZSTR_LEN(string2); unsigned char *in_1 = (unsigned char*)ZSTR_VAL(string1); @@ -3225,56 +3224,31 @@ PHP_FUNCTION(mb_levenshtein) for (i2 = 0; i2 <= strlen_2; i2++) { p1[i2] = i2 * cost_ins; } - - zend_long tmp_wchar_len_1 = 0; - zend_long tmp_wchar_len_2 = 0; - bool first = true; - - while (in_len_1) { - tmp_wchar_len_1 = enc->to_wchar(&in_1, &in_len_1, wchar_buf_1, 128, &state); - ZEND_ASSERT(tmp_wchar_len_1 <= 128); - tmp_wchar_len_2 = enc->to_wchar(&in_2, &in_len_2, wchar_buf_2, 128, &state); - len_2 += tmp_wchar_len_2; - ZEND_ASSERT(tmp_wchar_len_2 <= 128); - - for (i1 = 0; i1 < 
tmp_wchar_len_1; i1++) { - /* First loop that does not cross a 128 code points */ - if (first) { - p2[0] = p1[0] + cost_del; + zend_long tmp_wchar_len; + + for (i1 = 0; i1 < strlen_1; i1++) { + tmp_wchar_len = enc->to_wchar(&in_1, &in_len_1, wchar_buf_1, 1, &state); + ZEND_ASSERT(tmp_wchar_len <= 1); + p2[0] = p1[0] + cost_del; + for (i2 = 0; i2 < strlen_2; i2++) { + tmp_wchar_len = enc->to_wchar(&in_2, &in_len_2, wchar_buf_2, 1, &state); + ZEND_ASSERT(tmp_wchar_len <= 1); + c0 = p1[i2] + (wchar_buf_1[0] == wchar_buf_2[0] ? 0 : cost_rep); + c1 = p1[i2 + 1] + cost_del; + if (c1 < c0) { + c0 = c1; } - if (tmp_wchar_len_2 == 0) { - /* Insertion process when there is a surplus of 128 code points. */ - for (i2 = 0; i2 < tmp_wchar_len_1; i2++) { - c0 = p1[i2 + (len_2 - tmp_wchar_len_1)] + cost_rep; - c1 = p1[i2 + (len_2 - tmp_wchar_len_1) + 1] + cost_del; - if (c1 < c0) { - c0 = c1; - } - c2 = p2[i2] + cost_ins; - if (c2 < c0) { - c0 = c2; - } - p2[i2 + (len_2 - tmp_wchar_len_1) + 1] = c0; - } - } else { - for (i2 = 0; i2 < tmp_wchar_len_2; i2++) { - c0 = p1[i2 + (len_2 - tmp_wchar_len_2)] + (wchar_buf_1[i1] == wchar_buf_2[i2] ? 0 : cost_rep); - c1 = p1[i2 + (len_2 - tmp_wchar_len_2) + 1] + cost_del; - if (c1 < c0) { - c0 = c1; - } - c2 = p2[i2 + (len_2 - tmp_wchar_len_2)] + cost_ins; - if (c2 < c0) { - c0 = c2; - } - p2[i2 + (len_2 - tmp_wchar_len_2) + 1] = c0; - } + c2 = p2[i2] + cost_ins; + if (c2 < c0) { + c0 = c2; } - tmp = p1; - p1 = p2; - p2 = tmp; + p2[i2 + 1] = c0; } - first = false; + in_2 = (unsigned char*)ZSTR_VAL(string2); + in_len_2 = ZSTR_LEN(string2); + tmp = p1; + p1 = p2; + p2 = tmp; } c0 = p1[strlen_2]; diff --git a/ext/mbstring/tests/mb_levenshtein.phpt b/ext/mbstring/tests/mb_levenshtein.phpt index affc51a5fe3b3..4e56204518d04 100644 --- a/ext/mbstring/tests/mb_levenshtein.phpt +++ b/ext/mbstring/tests/mb_levenshtein.phpt @@ -40,6 +40,7 @@ echo '--- 128 codepoints ---' . \PHP_EOL; var_dump(mb_levenshtein(str_repeat("a", 128), str_repeat("a", 125) . 
"abc")); echo '--- 128 codepoints over ---' . \PHP_EOL; var_dump(mb_levenshtein(str_repeat("a", 128) . "abc", str_repeat("a", 128) . "aaa")); +var_dump(mb_levenshtein(str_repeat("a", 256) . "abc", "aaa")); echo '--- 128 codepoints over only $string1 ---' . \PHP_EOL; var_dump(mb_levenshtein(str_repeat("a", 128) . "abc", "aaa")); echo '--- 128 codepoints over only $string2 ---' . \PHP_EOL; @@ -58,14 +59,16 @@ var_dump(mb_levenshtein("cafe", $cafe, encoding: "ISO-8859-1")); echo '--- Usecase of userland code ---' . \PHP_EOL; +$bytes = ""; for ($i = 0; $i < 100; $i++) { - $bytes = ""; for ($j = 0; $j < 10; $j++) { - $bytes .= mb_chr(mt_rand(0, 0x10FFF)); + $bytes .= mb_chr(mt_rand(0, 0xFFFF)); } $compare = "あいうえおABCDEF"; - if (mb_levenshtein($bytes, $compare) !== keinos_mb_levenshtein($bytes, $compare)) { - throw Exception("mb_levenshtein compare error: {$bytes}"); + $mb_levenshtein_score = mb_levenshtein($bytes, $compare, encoding: "UTF-8"); + $watchstate_mb_levenshtein_score = watchstate_mb_levenshtein($bytes, $compare); + if ($mb_levenshtein_score !== $watchstate_mb_levenshtein_score) { + throw new Exception("mb_levenshtein compare error: {$mb_levenshtein_score} !== {$keinos_mb_levenshtein_score} param: {$bytes} vs {$compare}"); } } echo "OK" . 
PHP_EOL; @@ -100,6 +103,7 @@ int(4) int(2) --- 128 codepoints over --- int(2) +int(256) --- 128 codepoints over only $string1 --- int(128) --- 128 codepoints over only $string2 --- diff --git a/ext/mbstring/tests/mb_levenshtein_userland.inc b/ext/mbstring/tests/mb_levenshtein_userland.inc index 12236ae53bb5a..a0cadce86cc43 100644 --- a/ext/mbstring/tests/mb_levenshtein_userland.inc +++ b/ext/mbstring/tests/mb_levenshtein_userland.inc @@ -1,15 +1,14 @@ Date: Wed, 9 Oct 2024 01:57:11 +0900 Subject: [PATCH 14/16] Add test case for variable selector --- ext/mbstring/tests/mb_levenshtein.phpt | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ext/mbstring/tests/mb_levenshtein.phpt b/ext/mbstring/tests/mb_levenshtein.phpt index 4e56204518d04..b666b59705c11 100644 --- a/ext/mbstring/tests/mb_levenshtein.phpt +++ b/ext/mbstring/tests/mb_levenshtein.phpt @@ -57,6 +57,16 @@ echo '--- café in ISO-8859-1 ---' . \PHP_EOL; $cafe = mb_convert_encoding("café", "ISO-8859-1", "UTF-8"); var_dump(mb_levenshtein("cafe", $cafe, encoding: "ISO-8859-1")); +echo '--- Variable selector ---' . \PHP_EOL; +$ka = "カ́"; +var_dump(mb_levenshtein("カ", $ka, encoding: "UTF-8")); +// variable $nabe and $nabe_E0100 is seems nothing different. +// However, $nabe_E0100 is variable selector in U+908A U+E0100. +// Therfore, this result is 1. +$nabe = '邊'; +$nabe_E0100 = "邊󠄀"; +var_dump(mb_levenshtein($nabe, $nabe_E0100, encoding: "UTF-8")); + echo '--- Usecase of userland code ---' . \PHP_EOL; $bytes = ""; @@ -114,5 +124,8 @@ int(2) int(2) --- café in ISO-8859-1 --- int(1) +--- Variable selector --- +int(1) +int(1) --- Usecase of userland code --- OK From 283512b35a98dd63b82df8b97d35318b9c545914 Mon Sep 17 00:00:00 2001 From: Yuya Hamada Date: Mon, 14 Oct 2024 16:22:52 +0900 Subject: [PATCH 15/16] Add test case of emoji. I think one of usecase is compare per codepoint emoji. 
--- ext/mbstring/tests/mb_levenshtein.phpt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ext/mbstring/tests/mb_levenshtein.phpt b/ext/mbstring/tests/mb_levenshtein.phpt index b666b59705c11..59b96e723f31d 100644 --- a/ext/mbstring/tests/mb_levenshtein.phpt +++ b/ext/mbstring/tests/mb_levenshtein.phpt @@ -82,6 +82,9 @@ for ($i = 0; $i < 100; $i++) { } } echo "OK" . PHP_EOL; +echo '--- Usecase of Emoji ---' . \PHP_EOL; +var_dump(mb_levenshtein("🙇‍♀️", "🙇‍♂️")); +var_dump(mb_levenshtein("🙇", "🙇‍♂️")); ?> --EXPECT-- --- Equal --- @@ -129,3 +132,6 @@ int(1) int(1) --- Usecase of userland code --- OK +--- Usecase of Emoji --- +int(1) +int(3) From 916887e17fc9ef759ffdf1878824fd13b95123aa Mon Sep 17 00:00:00 2001 From: Yuya Hamada Date: Thu, 6 Feb 2025 11:41:50 +0900 Subject: [PATCH 16/16] If testcase if failed, output to UTF-16. --- ext/mbstring/tests/mb_levenshtein.phpt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ext/mbstring/tests/mb_levenshtein.phpt b/ext/mbstring/tests/mb_levenshtein.phpt index 59b96e723f31d..7c817b889eb36 100644 --- a/ext/mbstring/tests/mb_levenshtein.phpt +++ b/ext/mbstring/tests/mb_levenshtein.phpt @@ -78,7 +78,9 @@ for ($i = 0; $i < 100; $i++) { $mb_levenshtein_score = mb_levenshtein($bytes, $compare, encoding: "UTF-8"); $watchstate_mb_levenshtein_score = watchstate_mb_levenshtein($bytes, $compare); if ($mb_levenshtein_score !== $watchstate_mb_levenshtein_score) { - throw new Exception("mb_levenshtein compare error: {$mb_levenshtein_score} !== {$keinos_mb_levenshtein_score} param: {$bytes} vs {$compare}"); + $hexbytes = bin2hex(mb_convert_encoding($bytes, "UTF-16", "UTF-8")); + $hexcompare = bin2hex(mb_convert_encoding($compare, "UTF-16", "UTF-8")); + throw new Exception("mb_levenshtein compare error: {$mb_levenshtein_score} !== {$keinos_mb_levenshtein_score} param: {$hexbytes} vs {$hexcompare}"); } } echo "OK" . PHP_EOL;