Skip to content

[Draft][Require RFC] mb_levenshtein function #16043

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 16 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions ext/mbstring/mbstring.c
Original file line number Diff line number Diff line change
Expand Up @@ -3166,6 +3166,98 @@ PHP_FUNCTION(mb_rtrim)
php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_RTRIM);
}

PHP_FUNCTION(mb_levenshtein)
{
zend_string *string1, *string2, *enc_name = NULL;
zend_long cost_ins = 1;
zend_long cost_rep = 1;
zend_long cost_del = 1;

ZEND_PARSE_PARAMETERS_START(2, 6)
Z_PARAM_STR(string1)
Z_PARAM_STR(string2)
Z_PARAM_OPTIONAL
Z_PARAM_LONG(cost_ins)
Z_PARAM_LONG(cost_rep)
Z_PARAM_LONG(cost_del)
Z_PARAM_STR_OR_NULL(enc_name)
ZEND_PARSE_PARAMETERS_END();

const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 6);
if (!enc) {
RETURN_THROWS();
}

/* When all costs are equal, levenshtein fulfills the requirements of a metric, which means
* that the distance is symmetric. If string1 is shorter than string 2 we can save memory (and CPU time)
* by having shorter rows (p1 & p2). */
if (ZSTR_LEN(string1) < ZSTR_LEN(string2) && cost_ins == cost_rep && cost_rep == cost_del) {
zend_string *tmp = string1;
string1 = string2;
string2 = tmp;
}

uint32_t wchar_buf_1[1], wchar_buf_2[1];
size_t i1, i2;
zend_long *p1, *p2, *tmp;
size_t strlen_1 = mb_get_strlen(string1, enc);
size_t strlen_2 = mb_get_strlen(string2, enc);
size_t in_len_1 = ZSTR_LEN(string1);
size_t in_len_2 = ZSTR_LEN(string2);
unsigned char *in_1 = (unsigned char*)ZSTR_VAL(string1);
unsigned char *in_2 = (unsigned char*)ZSTR_VAL(string2);
unsigned int state = 0;

if (strlen_1 == 0) {
RETURN_LONG(strlen_2 * cost_ins);
}

if (strlen_2 == 0) {
RETURN_LONG(strlen_1 * cost_ins);
}

zend_long c0, c1, c2;

p1 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0);
p2 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0);

for (i2 = 0; i2 <= strlen_2; i2++) {
p1[i2] = i2 * cost_ins;
}
zend_long tmp_wchar_len;

for (i1 = 0; i1 < strlen_1; i1++) {
tmp_wchar_len = enc->to_wchar(&in_1, &in_len_1, wchar_buf_1, 1, &state);
ZEND_ASSERT(tmp_wchar_len <= 1);
p2[0] = p1[0] + cost_del;
for (i2 = 0; i2 < strlen_2; i2++) {
tmp_wchar_len = enc->to_wchar(&in_2, &in_len_2, wchar_buf_2, 1, &state);
ZEND_ASSERT(tmp_wchar_len <= 1);
c0 = p1[i2] + (wchar_buf_1[0] == wchar_buf_2[0] ? 0 : cost_rep);
c1 = p1[i2 + 1] + cost_del;
if (c1 < c0) {
c0 = c1;
}
c2 = p2[i2] + cost_ins;
if (c2 < c0) {
c0 = c2;
}
p2[i2 + 1] = c0;
}
in_2 = (unsigned char*)ZSTR_VAL(string2);
in_len_2 = ZSTR_LEN(string2);
tmp = p1;
p1 = p2;
p2 = tmp;
}

c0 = p1[strlen_2];
efree(p1);
efree(p2);

RETURN_LONG(c0);
}

static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
{
const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0);
Expand Down
2 changes: 2 additions & 0 deletions ext/mbstring/mbstring.stub.php
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ function mb_ltrim(string $string, ?string $characters = null, ?string $encoding

function mb_rtrim(string $string, ?string $characters = null, ?string $encoding = null): string {}

/**
 * Levenshtein distance between $string1 and $string2, where the strings are compared
 * character by character as decoded in $encoding rather than byte by byte.
 * Each of the three edit costs defaults to 1.
 */
function mb_levenshtein(string $string1, string $string2, int $insertion_cost = 1, int $replacement_cost = 1, int $deletion_cost = 1, ?string $encoding = null): int {}

/** @refcount 1 */
function mb_detect_encoding(string $string, array|string|null $encodings = null, bool $strict = false): string|false {}

Expand Down
13 changes: 12 additions & 1 deletion ext/mbstring/mbstring_arginfo.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

139 changes: 139 additions & 0 deletions ext/mbstring/tests/mb_levenshtein.phpt
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
--TEST--
mb_levenshtein() function test
--EXTENSIONS--
mbstring
--FILE--
<?php
// Provides watchstate_mb_levenshtein(), the userland reference implementation that the
// randomized comparison further down in this test checks mb_levenshtein() against.
require_once("mb_levenshtein_userland.inc");
// Throughout this test, the optional positional arguments after the two strings are,
// in order: insertion cost, replacement cost, deletion cost (each defaults to 1).
echo '--- Equal ---' . \PHP_EOL;
var_dump(mb_levenshtein('12345', '12345'));

// Empty-string edge cases, with default costs and with all costs set to 10.
echo '--- First string empty ---' . \PHP_EOL;
var_dump(mb_levenshtein('', 'xyz'));
echo '--- Second string empty ---' . \PHP_EOL;
var_dump(mb_levenshtein('xyz', ''));
echo '--- Both empty ---' . \PHP_EOL;
var_dump(mb_levenshtein('', ''));
var_dump(mb_levenshtein('', '', 10, 10, 10));

echo '--- 1 character ---' . \PHP_EOL;
var_dump(mb_levenshtein('1', '2'));
echo '--- 2 character swapped ---' . \PHP_EOL;
var_dump(mb_levenshtein('12', '21'));

// Insertion cost 2; replacement and deletion keep their default of 1.
echo '--- Inexpensive deletion ---' . \PHP_EOL;
var_dump(mb_levenshtein('2121', '11', 2));
// Insertion cost 2, replacement cost 1, deletion cost 5.
echo '--- Expensive deletion ---' . \PHP_EOL;
var_dump(mb_levenshtein('2121', '11', 2, 1, 5));

echo '--- Inexpensive insertion ---' . \PHP_EOL;
var_dump(mb_levenshtein('11', '2121'));
// Insertion cost 5; replacement and deletion keep their default of 1.
echo '--- Expensive insertion ---' . \PHP_EOL;
var_dump(mb_levenshtein('11', '2121', 5));

// Replacement (3) is cheaper than one deletion plus one insertion (2 + 2).
echo '--- Expensive replacement ---' . \PHP_EOL;
var_dump(mb_levenshtein('111', '121', 2, 3, 2));
// Replacement (9) is dearer than one deletion plus one insertion (2 + 2).
echo '--- Very expensive replacement ---' . \PHP_EOL;
var_dump(mb_levenshtein('111', '121', 2, 9, 2));

// NOTE(review): the 128-codepoint boundary presumably targets an internal decode-buffer
// size in the C implementation - confirm against ext/mbstring/mbstring.c.
echo '--- 128 codepoints ---' . \PHP_EOL;
var_dump(mb_levenshtein(str_repeat("a", 128), str_repeat("a", 125) . "abc"));
echo '--- 128 codepoints over ---' . \PHP_EOL;
var_dump(mb_levenshtein(str_repeat("a", 128) . "abc", str_repeat("a", 128) . "aaa"));
var_dump(mb_levenshtein(str_repeat("a", 256) . "abc", "aaa"));
echo '--- 128 codepoints over only $string1 ---' . \PHP_EOL;
var_dump(mb_levenshtein(str_repeat("a", 128) . "abc", "aaa"));
echo '--- 128 codepoints over only $string2 ---' . \PHP_EOL;
var_dump(mb_levenshtein("abc", str_repeat("a", 128) . "aaa"));
echo '--- 128 codepoints over Hiragana ---' . \PHP_EOL;
var_dump(mb_levenshtein(str_repeat("あ", 128) . "あああ", str_repeat("あ", 128) . "あいう"));

echo '--- 128 codepoints over Hiragana in Shift_JIS ---' . \PHP_EOL;
$hiragana_a = mb_convert_encoding("あ", "SJIS", "UTF-8");
$hiragana_aiu = mb_convert_encoding("あいう", "SJIS", "UTF-8");
var_dump(mb_levenshtein(str_repeat($hiragana_a, 128 + 3), str_repeat($hiragana_a, 128) . $hiragana_aiu, encoding: "SJIS"));

echo '--- café in ISO-8859-1 ---' . \PHP_EOL;
$cafe = mb_convert_encoding("café", "ISO-8859-1", "UTF-8");
var_dump(mb_levenshtein("cafe", $cafe, encoding: "ISO-8859-1"));

echo '--- Variable selector ---' . \PHP_EOL;
$ka = "カ́";
var_dump(mb_levenshtein("カ", $ka, encoding: "UTF-8"));
// $nabe and $nabe_E0100 look identical when rendered.
// However, $nabe_E0100 is U+908A followed by the variation selector U+E0100,
// so the expected distance is 1.
$nabe = '邊';
$nabe_E0100 = "邊󠄀";
var_dump(mb_levenshtein($nabe, $nabe_E0100, encoding: "UTF-8"));

echo '--- Usecase of userland code ---' . \PHP_EOL;

$bytes = "";
for ($i = 0; $i < 100; $i++) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see you are using 100 random test cases here. Maybe you are afraid of making the tests too slow? But unless the Levenshtein algorithm is very expensive, I think increasing this to 1000 or 10,000 shouldn't affect runtime of the test much. Am I wrong?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See https://en.wikipedia.org/wiki/Levenshtein_distance#Computational_complexity. So yes, the algorithm is rather expensive, but unless the test is slower than ~1sec, we can increase the number of iterations. And we might increase even further when marking the test as SLOW_TEST.

for ($j = 0; $j < 10; $j++) {
$bytes .= mb_chr(mt_rand(0, 0xFFFF));
}
$compare = "あいうえおABCDEF";
$mb_levenshtein_score = mb_levenshtein($bytes, $compare, encoding: "UTF-8");
$watchstate_mb_levenshtein_score = watchstate_mb_levenshtein($bytes, $compare);
// The native result must equal the userland reference result; on a mismatch, report both
// scores together with the UTF-16 hex dump of each input so the failing case can be replayed.
if ($mb_levenshtein_score !== $watchstate_mb_levenshtein_score) {
    $hexbytes = bin2hex(mb_convert_encoding($bytes, "UTF-16", "UTF-8"));
    $hexcompare = bin2hex(mb_convert_encoding($compare, "UTF-16", "UTF-8"));
    throw new Exception("mb_levenshtein compare error: {$mb_levenshtein_score} !== {$watchstate_mb_levenshtein_score} param: {$hexbytes} vs {$hexcompare}");
}
}
echo "OK" . PHP_EOL;
echo '--- Usecase of Emoji ---' . \PHP_EOL;
var_dump(mb_levenshtein("🙇‍♀️", "🙇‍♂️"));
var_dump(mb_levenshtein("🙇", "🙇‍♂️"));
?>
--EXPECT--
--- Equal ---
int(0)
--- First string empty ---
int(3)
--- Second string empty ---
int(3)
--- Both empty ---
int(0)
int(0)
--- 1 character ---
int(1)
--- 2 character swapped ---
int(2)
--- Inexpensive deletion ---
int(2)
--- Expensive deletion ---
int(10)
--- Inexpensive insertion ---
int(2)
--- Expensive insertion ---
int(10)
--- Expensive replacement ---
int(3)
--- Very expensive replacement ---
int(4)
--- 128 codepoints ---
int(2)
--- 128 codepoints over ---
int(2)
int(256)
--- 128 codepoints over only $string1 ---
int(128)
--- 128 codepoints over only $string2 ---
int(130)
--- 128 codepoints over Hiragana ---
int(2)
--- 128 codepoints over Hiragana in Shift_JIS ---
int(2)
--- café in ISO-8859-1 ---
int(1)
--- Variable selector ---
int(1)
int(1)
--- Usecase of userland code ---
OK
--- Usecase of Emoji ---
int(1)
int(3)
69 changes: 69 additions & 0 deletions ext/mbstring/tests/mb_levenshtein_userland.inc
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
<?php
/* from: https://github.com/arabcoders/watchstate/blob/18e048b3a692aa0f46d34bde03c8518854907e4f/src/API/Backend/Mismatched.php#L331 */
/**
* Copyright (c) 2024 ArabCoders
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is furnished
* to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

/**
 * Userland reference implementation of the Levenshtein distance.
 *
 * Both strings are measured and sliced as UTF-8, one character at a time, and every
 * insertion, deletion and substitution costs exactly 1.
 *
 * @param string $str1 The first string.
 * @param string $str2 The second string.
 *
 * @return int The Levenshtein distance between the two strings.
 */
function watchstate_mb_levenshtein(string $str1, string $str2): int
{
    $rows = mb_strlen($str1, 'UTF-8');
    $cols = mb_strlen($str2, 'UTF-8');

    // Always iterate with the longer string on the outside; unit costs make the distance symmetric.
    if ($rows < $cols) {
        return watchstate_mb_levenshtein($str2, $str1);
    }

    // $rows >= $cols here, so an empty first string means both are empty.
    if ($rows === 0) {
        return $cols;
    }

    if ($str1 === $str2) {
        return 0;
    }

    // $above is the previously completed matrix row; $line is the row being built.
    $above = range(0, $cols);
    $row = 0;

    while ($row < $rows) {
        $rowChar = mb_substr($str1, $row, 1, 'UTF-8');
        $row++;
        $line = [$row];

        for ($col = 1; $col <= $cols; $col++) {
            $colChar = mb_substr($str2, $col - 1, 1, 'UTF-8');

            $diagonal = $above[$col - 1];
            if ($rowChar !== $colChar) {
                $diagonal++;
            }

            $line[$col] = min($above[$col] + 1, $line[$col - 1] + 1, $diagonal);
        }

        $above = $line;
    }

    return $above[$cols];
}