-
Notifications
You must be signed in to change notification settings - Fork 7.9k
[Draft][Require RFC] mb_levenshtein function #16043
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
af72b0b
aa2b209
4d8aa99
4b4f8a0
952af91
f764cbf
8108bc2
3bedd87
9233ecc
cf56777
d27cfa0
4fbb4d4
4f255f7
e4de70f
283512b
916887e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
--TEST-- | ||
mb_levenshtein() function test | ||
--EXTENSIONS-- | ||
mbstring | ||
--FILE-- | ||
<?php | ||
require_once("mb_levenshtein_userland.inc"); | ||
// Baseline cases, all called without explicit cost arguments unless noted. | ||
// Identical inputs: the expected distance is 0. | ||
echo '--- Equal ---' . \PHP_EOL; | ||
var_dump(mb_levenshtein('12345', '12345')); | ||
|
||
||
// One side empty: the expected distance equals the length of the other string (3). | ||
echo '--- First string empty ---' . \PHP_EOL; | ||
var_dump(mb_levenshtein('', 'xyz')); | ||
echo '--- Second string empty ---' . \PHP_EOL; | ||
var_dump(mb_levenshtein('xyz', '')); | ||
echo '--- Both empty ---' . \PHP_EOL; | ||
var_dump(mb_levenshtein('', '')); | ||
// Explicit costs of 10 must not change the result when there is nothing to edit. | ||
var_dump(mb_levenshtein('', '', 10, 10, 10)); | ||
|
||
||
// A single differing character: the expected distance is 1. | ||
echo '--- 1 character ---' . \PHP_EOL; | ||
var_dump(mb_levenshtein('1', '2')); | ||
// Two adjacent characters swapped: the expected distance is 2, i.e. a swap is | ||
// not counted as one transposition step. | ||
echo '--- 2 character swapped ---' . \PHP_EOL; | ||
var_dump(mb_levenshtein('12', '21')); | ||
|
||
// Cost-parameter cases. Judging by the section labels and the expected output, | ||
// the optional 3rd, 4th and 5th arguments are the insertion, replacement and | ||
// deletion cost, in that order; omitted costs behave as 1. | ||
// '2121' -> '11' needs two deletions: 2 * 1 = 2 (the insertion cost of 2 is unused). | ||
echo '--- Inexpensive deletion ---' . \PHP_EOL; | ||
var_dump(mb_levenshtein('2121', '11', 2)); | ||
// Same edit with a deletion cost of 5: 2 * 5 = 10. | ||
echo '--- Expensive deletion ---' . \PHP_EOL; | ||
var_dump(mb_levenshtein('2121', '11', 2, 1, 5)); | ||
|
||
||
// '11' -> '2121' needs two insertions: 2 * 1 = 2. | ||
echo '--- Inexpensive insertion ---' . \PHP_EOL; | ||
var_dump(mb_levenshtein('11', '2121')); | ||
// Same edit with an insertion cost of 5: 2 * 5 = 10. | ||
echo '--- Expensive insertion ---' . \PHP_EOL; | ||
var_dump(mb_levenshtein('11', '2121', 5)); | ||
|
||
||
// One replacement at cost 3 is cheaper than delete + insert (2 + 2 = 4): expect 3. | ||
echo '--- Expensive replacement ---' . \PHP_EOL; | ||
var_dump(mb_levenshtein('111', '121', 2, 3, 2)); | ||
// A replacement at cost 9 loses to delete + insert (2 + 2 = 4): expect 4. | ||
echo '--- Very expensive replacement ---' . \PHP_EOL; | ||
var_dump(mb_levenshtein('111', '121', 2, 9, 2)); | ||
|
||
echo '--- 128 codepoints ---' . \PHP_EOL; | ||
var_dump(mb_levenshtein(str_repeat("a", 128), str_repeat("a", 125) . "abc")); | ||
echo '--- 128 codepoints over ---' . \PHP_EOL; | ||
var_dump(mb_levenshtein(str_repeat("a", 128) . "abc", str_repeat("a", 128) . "aaa")); | ||
var_dump(mb_levenshtein(str_repeat("a", 256) . "abc", "aaa")); | ||
echo '--- 128 codepoints over only $string1 ---' . \PHP_EOL; | ||
var_dump(mb_levenshtein(str_repeat("a", 128) . "abc", "aaa")); | ||
echo '--- 128 codepoints over only $string2 ---' . \PHP_EOL; | ||
var_dump(mb_levenshtein("abc", str_repeat("a", 128) . "aaa")); | ||
echo '--- 128 codepoints over Hiragana ---' . \PHP_EOL; | ||
var_dump(mb_levenshtein(str_repeat("あ", 128) . "あああ", str_repeat("あ", 128) . "あいう")); | ||
|
||
echo '--- 128 codepoints over Hiragana in Shift_JIS ---' . \PHP_EOL; | ||
$hiragana_a = mb_convert_encoding("あ", "SJIS", "UTF-8"); | ||
$hiragana_aiu = mb_convert_encoding("あいう", "SJIS", "UTF-8"); | ||
var_dump(mb_levenshtein(str_repeat($hiragana_a, 128 + 3), str_repeat($hiragana_a, 128) . $hiragana_aiu, encoding: "SJIS")); | ||
|
||
echo '--- café in ISO-8859-1 ---' . \PHP_EOL; | ||
$cafe = mb_convert_encoding("café", "ISO-8859-1", "UTF-8"); | ||
var_dump(mb_levenshtein("cafe", $cafe, encoding: "ISO-8859-1")); | ||
|
||
echo '--- Variable selector ---' . \PHP_EOL; | ||
$ka = "カ́"; | ||
var_dump(mb_levenshtein("カ", $ka, encoding: "UTF-8")); | ||
// $nabe and $nabe_E0100 look identical when rendered. | ||
// However, $nabe_E0100 is U+908A followed by the variation selector U+E0100. | ||
// Therefore, the expected distance is 1. | ||
$nabe = '邊'; | ||
$nabe_E0100 = "邊󠄀"; | ||
var_dump(mb_levenshtein($nabe, $nabe_E0100, encoding: "UTF-8")); | ||
|
||
echo '--- Usecase of userland code ---' . \PHP_EOL; | ||
|
||
// Cross-check mb_levenshtein() against the userland reference implementation
// (watchstate_mb_levenshtein) on randomly generated UTF-8 input.
// NOTE(review): $bytes is initialised once, outside the loop, so it keeps
// growing by up to 10 code points per iteration instead of starting fresh
// each round. Confirm this is intended.
$bytes = "";
for ($i = 0; $i < 100; $i++) {
    // Review discussion on the iteration count: Levenshtein is comparatively
    // expensive, but the count can be raised (1000 or 10,000 were suggested)
    // as long as the test stays under ~1 second, and further still if the
    // test is marked as SLOW_TEST.
    for ($j = 0; $j < 10; $j++) {
        $bytes .= mb_chr(mt_rand(0, 0xFFFF));
    }
    $compare = "あいうえおABCDEF";
    $mb_levenshtein_score = mb_levenshtein($bytes, $compare, encoding: "UTF-8");
    $watchstate_mb_levenshtein_score = watchstate_mb_levenshtein($bytes, $compare);
    if ($mb_levenshtein_score !== $watchstate_mb_levenshtein_score) {
        // Dump both inputs as UTF-16 hex so the failing random case can be reproduced.
        $hexbytes = bin2hex(mb_convert_encoding($bytes, "UTF-16", "UTF-8"));
        $hexcompare = bin2hex(mb_convert_encoding($compare, "UTF-16", "UTF-8"));
        // Fixed: the message previously interpolated the undefined variable
        // $keinos_mb_levenshtein_score, so the reference score was never shown.
        throw new Exception("mb_levenshtein compare error: {$mb_levenshtein_score} !== {$watchstate_mb_levenshtein_score} param: {$hexbytes} vs {$hexcompare}");
    }
}
echo "OK" . PHP_EOL; | ||
echo '--- Usecase of Emoji ---' . \PHP_EOL; | ||
var_dump(mb_levenshtein("🙇♀️", "🙇♂️")); | ||
var_dump(mb_levenshtein("🙇", "🙇♂️")); | ||
?> | ||
--EXPECT-- | ||
--- Equal --- | ||
int(0) | ||
--- First string empty --- | ||
int(3) | ||
--- Second string empty --- | ||
int(3) | ||
--- Both empty --- | ||
int(0) | ||
int(0) | ||
--- 1 character --- | ||
int(1) | ||
--- 2 character swapped --- | ||
int(2) | ||
--- Inexpensive deletion --- | ||
int(2) | ||
--- Expensive deletion --- | ||
int(10) | ||
--- Inexpensive insertion --- | ||
int(2) | ||
--- Expensive insertion --- | ||
int(10) | ||
--- Expensive replacement --- | ||
int(3) | ||
--- Very expensive replacement --- | ||
int(4) | ||
--- 128 codepoints --- | ||
int(2) | ||
--- 128 codepoints over --- | ||
int(2) | ||
int(256) | ||
--- 128 codepoints over only $string1 --- | ||
int(128) | ||
--- 128 codepoints over only $string2 --- | ||
int(130) | ||
--- 128 codepoints over Hiragana --- | ||
int(2) | ||
--- 128 codepoints over Hiragana in Shift_JIS --- | ||
int(2) | ||
--- café in ISO-8859-1 --- | ||
int(1) | ||
--- Variable selector --- | ||
int(1) | ||
int(1) | ||
--- Usecase of userland code --- | ||
OK | ||
--- Usecase of Emoji --- | ||
int(1) | ||
int(3) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
<?php | ||
/* from: https://github.com/arabcoders/watchstate/blob/18e048b3a692aa0f46d34bde03c8518854907e4f/src/API/Backend/Mismatched.php#L331 */ | ||
/** | ||
* Copyright (c) 2024 ArabCoders | ||
* | ||
* Permission is hereby granted, free of charge, to any person obtaining a copy | ||
* of this software and associated documentation files (the "Software"), to deal | ||
* in the Software without restriction, including without limitation the rights | ||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
* copies of the Software, and to permit persons to whom the Software is furnished | ||
* to do so, subject to the following conditions: | ||
* | ||
* The above copyright notice and this permission notice shall be included in all | ||
* copies or substantial portions of the Software. | ||
* | ||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
* THE SOFTWARE. | ||
*/ | ||
|
||
/**
 * Userland implementation of the Levenshtein distance for UTF-8 strings.
 *
 * Both inputs are measured and sliced per character with the mbstring
 * functions using a fixed 'UTF-8' encoding. Insertion, deletion and
 * substitution each cost 1.
 *
 * @param string $str1 The first string.
 * @param string $str2 The second string.
 *
 * @return int The Levenshtein distance between the two strings.
 */
function watchstate_mb_levenshtein(string $str1, string $str2): int
{
    $length1 = mb_strlen($str1, 'UTF-8');
    $length2 = mb_strlen($str2, 'UTF-8');

    // Normalise so that $str1 is the longer input; with unit costs the
    // distance is symmetric, so swapping the arguments is safe.
    if ($length1 < $length2) {
        return watchstate_mb_levenshtein($str2, $str1);
    }

    // After the swap above, $length1 === 0 implies $length2 === 0 as well.
    if (0 === $length1) {
        return $length2;
    }

    if ($str1 === $str2) {
        return 0;
    }

    // Split $str2 into characters once. The original code re-ran
    // mb_substr($str2, $j, 1) for every character of $str1, repeating the
    // same lookups $length1 times.
    $chars2 = [];
    for ($j = 0; $j < $length2; $j++) {
        $chars2[] = mb_substr($str2, $j, 1, 'UTF-8');
    }

    // $prevRow[$j] is the distance between the processed prefix of $str1 and
    // the first $j characters of $str2; it starts as the row for the empty prefix.
    $prevRow = range(0, $length2);

    for ($i = 0; $i < $length1; $i++) {
        $c1 = mb_substr($str1, $i, 1, 'UTF-8');
        // Column 0: turning the first $i + 1 characters of $str1 into "".
        $currentRow = [$i + 1];

        foreach ($chars2 as $j => $c2) {
            $insertions = $prevRow[$j + 1] + 1;
            $deletions = $currentRow[$j] + 1;
            $substitutions = $prevRow[$j] + (($c1 !== $c2) ? 1 : 0);
            $currentRow[] = min($insertions, $deletions, $substitutions);
        }

        $prevRow = $currentRow;
    }

    return $prevRow[$length2];
}
|
Uh oh!
There was an error while loading. Please reload this page.