php · youkidearitai · Sep 24, 2024 · Sep 25, 2024 · Sep 25, 2024 · Sep 25, 2024
@@ -3166,6 +3166,98 @@ PHP_FUNCTION(mb_rtrim)
 	php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_RTRIM);
 }
 
+/* Levenshtein distance between two strings, measured in characters decoded by the
+ * encoding's to_wchar converter instead of bytes.
+ *
+ * Arguments: string1, string2, then the optional insertion, replacement and deletion
+ * costs (each defaulting to 1) and an optional encoding name that is resolved through
+ * php_mb_get_encoding(). Returns the cheapest total cost of turning string1 into
+ * string2; bails out via RETURN_THROWS() when the encoding cannot be resolved. */
+PHP_FUNCTION(mb_levenshtein)
+{
+	zend_string *string1, *string2, *enc_name = NULL;
+	zend_long cost_ins = 1;
+	zend_long cost_rep = 1;
+	zend_long cost_del = 1;
+
+	ZEND_PARSE_PARAMETERS_START(2, 6)
+		Z_PARAM_STR(string1)
+		Z_PARAM_STR(string2)
+		Z_PARAM_OPTIONAL
+		Z_PARAM_LONG(cost_ins)
+		Z_PARAM_LONG(cost_rep)
+		Z_PARAM_LONG(cost_del)
+		Z_PARAM_STR_OR_NULL(enc_name)
+	ZEND_PARSE_PARAMETERS_END();
+
+	/* 6 is the argument position of the encoding parameter. */
+	const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 6);
+	if (!enc) {
+		RETURN_THROWS();
+	}
+
+	/* When all costs are equal, levenshtein fulfills the requirements of a metric, which means
+	 * that the distance is symmetric. If string1 is shorter than string 2 we can save memory (and CPU time)
+	 * by having shorter rows (p1 & p2). */
+	if (ZSTR_LEN(string1) < ZSTR_LEN(string2) && cost_ins == cost_rep && cost_rep == cost_del) {
+		zend_string *tmp = string1;
+		string1 = string2;
+		string2 = tmp;
+	}
+
+	/* Zero-initialised so that a to_wchar call which yields no code point never leads to
+	 * a comparison of indeterminate values.
+	 * NOTE(review): to_wchar is handed a one-element output buffer here; confirm that every
+	 * encoding's converter supports a buffer that small. */
+	uint32_t wchar_buf_1[1] = {0}, wchar_buf_2[1] = {0};
+	size_t i1, i2;
+	zend_long *p1, *p2, *tmp;
+	size_t strlen_1 = mb_get_strlen(string1, enc);
+	size_t strlen_2 = mb_get_strlen(string2, enc);
+	size_t in_len_1 = ZSTR_LEN(string1);
+	size_t in_len_2 = ZSTR_LEN(string2);
+	unsigned char *in_1 = (unsigned char*)ZSTR_VAL(string1);
+	unsigned char *in_2 = (unsigned char*)ZSTR_VAL(string2);
+	/* The two strings are decoded independently, so each one needs its own converter
+	 * state; sharing one would let a mode switch in one string leak into the other. */
+	unsigned int state_1 = 0;
+	unsigned int state_2 = 0;
+
+	/* Empty string1: string2 has to be inserted character by character. */
+	if (strlen_1 == 0) {
+		RETURN_LONG(strlen_2 * cost_ins);
+	}
+
+	/* Empty string2: every character of string1 has to be deleted. */
+	if (strlen_2 == 0) {
+		RETURN_LONG(strlen_1 * cost_del);
+	}
+
+	zend_long c0, c1, c2;
+
+	/* p1 is the previous row of the cost matrix, p2 the row being filled in. */
+	p1 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0);
+	p2 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0);
+
+	/* Row for the empty prefix of string1: only insertions. */
+	for (i2 = 0; i2 <= strlen_2; i2++) {
+		p1[i2] = i2 * cost_ins;
+	}
+	zend_long tmp_wchar_len;
+
+	for (i1 = 0; i1 < strlen_1; i1++) {
+		tmp_wchar_len = enc->to_wchar(&in_1, &in_len_1, wchar_buf_1, 1, &state_1);
+		ZEND_ASSERT(tmp_wchar_len <= 1);
+		p2[0] = p1[0] + cost_del;
+		for (i2 = 0; i2 < strlen_2; i2++) {
+			tmp_wchar_len = enc->to_wchar(&in_2, &in_len_2, wchar_buf_2, 1, &state_2);
+			ZEND_ASSERT(tmp_wchar_len <= 1);
+			/* Cheapest of: replace (or keep), delete from string1, insert into string1. */
+			c0 = p1[i2] + (wchar_buf_1[0] == wchar_buf_2[0] ? 0 : cost_rep);
+			c1 = p1[i2 + 1] + cost_del;
+			if (c1 < c0) {
+				c0 = c1;
+			}
+			c2 = p2[i2] + cost_ins;
+			if (c2 < c0) {
+				c0 = c2;
+			}
+			p2[i2 + 1] = c0;
+		}
+		/* Rewind string2 for the next row: input pointer, remaining length and
+		 * decoder state all go back to their initial values. */
+		in_2 = (unsigned char*)ZSTR_VAL(string2);
+		in_len_2 = ZSTR_LEN(string2);
+		state_2 = 0;
+		tmp = p1;
+		p1 = p2;
+		p2 = tmp;
+	}
+
+	/* After the final swap p1 holds the last completed row. */
+	c0 = p1[strlen_2];
+	efree(p1);
+	efree(p2);
+
+	RETURN_LONG(c0);
+}
+
 static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
 {
 	const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0);

@@ -145,6 +145,8 @@ function mb_ltrim(string $string, ?string $characters = null, ?string $encoding
 
 function mb_rtrim(string $string, ?string $characters = null, ?string $encoding = null): string {}
 
+// Levenshtein distance between $string1 and $string2, compared per character decoded in $encoding rather than per byte.
+// The three costs are taken in the order insertion, replacement, deletion.
+function mb_levenshtein(string $string1, string $string2, int $insertion_cost = 1, int $replacement_cost = 1, int $deletion_cost = 1, ?string $encoding = null): int {}
+
 /** @refcount 1 */
 function mb_detect_encoding(string $string, array|string|null $encodings = null, bool $strict = false): string|false {}
 

@@ -0,0 +1,139 @@
+--TEST--
+mb_levenshtein() function test
+--EXTENSIONS--
+mbstring
+--FILE--
+<?php
+// Provides watchstate_mb_levenshtein(), the userland reference used for the cross-check below.
+require_once("mb_levenshtein_userland.inc");
+echo '--- Equal ---' . \PHP_EOL;
+var_dump(mb_levenshtein('12345', '12345'));
+
+echo '--- First string empty ---' . \PHP_EOL;
+var_dump(mb_levenshtein('', 'xyz'));
+echo '--- Second string empty ---' . \PHP_EOL;
+var_dump(mb_levenshtein('xyz', ''));
+echo '--- Both empty ---' . \PHP_EOL;
+var_dump(mb_levenshtein('', ''));
+var_dump(mb_levenshtein('', '', 10, 10, 10));
+
+echo '--- 1 character ---' . \PHP_EOL;
+var_dump(mb_levenshtein('1', '2'));
+echo '--- 2 character swapped ---' . \PHP_EOL;
+var_dump(mb_levenshtein('12', '21'));
+
+// Optional cost arguments are passed as (insertion, replacement, deletion).
+echo '--- Inexpensive deletion ---' . \PHP_EOL;
+// Only the insertion cost is raised (2); the two deletions keep the default cost of 1.
+var_dump(mb_levenshtein('2121', '11', 2));
+echo '--- Expensive deletion ---' . \PHP_EOL;
+// Deletion cost 5: the two deletions add up to 10.
+var_dump(mb_levenshtein('2121', '11', 2, 1, 5));
+
+echo '--- Inexpensive insertion ---' . \PHP_EOL;
+var_dump(mb_levenshtein('11', '2121'));
+echo '--- Expensive insertion ---' . \PHP_EOL;
+// Insertion cost 5: the two insertions add up to 10.
+var_dump(mb_levenshtein('11', '2121', 5));
+
+echo '--- Expensive replacement ---' . \PHP_EOL;
+// One replacement (3) is still cheaper than a deletion plus an insertion (2 + 2).
+var_dump(mb_levenshtein('111', '121', 2, 3, 2));
+echo '--- Very expensive replacement ---' . \PHP_EOL;
+// Replacement costs 9, so deleting and inserting (2 + 2) wins.
+var_dump(mb_levenshtein('111', '121', 2, 9, 2));
+
+// Inputs of exactly 128 code points and longer.
+// NOTE(review): 128 presumably matches an internal decoding chunk size - confirm.
+echo '--- 128 codepoints ---' . \PHP_EOL;
+var_dump(mb_levenshtein(str_repeat("a", 128), str_repeat("a", 125) . "abc"));
+echo '--- 128 codepoints over ---' . \PHP_EOL;
+var_dump(mb_levenshtein(str_repeat("a", 128) . "abc", str_repeat("a", 128) . "aaa"));
+var_dump(mb_levenshtein(str_repeat("a", 256) . "abc", "aaa"));
+echo '--- 128 codepoints over only $string1 ---' . \PHP_EOL;
+var_dump(mb_levenshtein(str_repeat("a", 128) . "abc", "aaa"));
+echo '--- 128 codepoints over only $string2 ---' . \PHP_EOL;
+var_dump(mb_levenshtein("abc", str_repeat("a", 128) . "aaa"));
+echo '--- 128 codepoints over Hiragana ---' . \PHP_EOL;
+var_dump(mb_levenshtein(str_repeat("あ", 128) . "あああ", str_repeat("あ", 128) . "あいう"));
+
+echo '--- 128 codepoints over Hiragana in Shift_JIS ---' . \PHP_EOL;
+// Same comparison as above, but on SJIS-encoded input with the encoding passed by name.
+$hiragana_a = mb_convert_encoding("あ", "SJIS", "UTF-8");
+$hiragana_aiu = mb_convert_encoding("あいう", "SJIS", "UTF-8");
+var_dump(mb_levenshtein(str_repeat($hiragana_a, 128 + 3), str_repeat($hiragana_a, 128) . $hiragana_aiu, encoding: "SJIS"));
+
+echo '--- café in ISO-8859-1 ---' . \PHP_EOL;
+$cafe = mb_convert_encoding("café", "ISO-8859-1", "UTF-8");
+var_dump(mb_levenshtein("cafe", $cafe, encoding: "ISO-8859-1"));
+
+echo '--- Variable selector ---' . \PHP_EOL;
+$ka = "カ́";
+var_dump(mb_levenshtein("カ", $ka, encoding: "UTF-8"));
+// $nabe and $nabe_E0100 look identical when rendered.
+// However, $nabe_E0100 is U+908A followed by the variation selector U+E0100.
+// Therefore, the expected distance is 1.
+$nabe = '邊';
+$nabe_E0100 = "邊󠄀";
+var_dump(mb_levenshtein($nabe, $nabe_E0100, encoding: "UTF-8"));
+
+echo '--- Usecase of userland code ---' . \PHP_EOL;
+
+// Cross-check mb_levenshtein() against the userland reference implementation on random input.
+// $bytes is not reset between rounds, so it grows by ten mb_chr() results per round.
+$bytes = "";
+for ($i = 0; $i < 100; $i++) {
+	for ($j = 0; $j < 10; $j++) {
+		$bytes .= mb_chr(mt_rand(0, 0xFFFF));
+	}
+	$compare = "あいうえおABCDEF";
+	$mb_levenshtein_score = mb_levenshtein($bytes, $compare, encoding: "UTF-8");
+	$watchstate_mb_levenshtein_score = watchstate_mb_levenshtein($bytes, $compare);
+	if ($mb_levenshtein_score !== $watchstate_mb_levenshtein_score) {
+		// Dump both inputs as UTF-16 hex so a failing random case can be reproduced.
+		$hexbytes = bin2hex(mb_convert_encoding($bytes, "UTF-16", "UTF-8"));
+		$hexcompare = bin2hex(mb_convert_encoding($compare, "UTF-16", "UTF-8"));
+		throw new Exception("mb_levenshtein compare error: {$mb_levenshtein_score} !== {$watchstate_mb_levenshtein_score} param: {$hexbytes} vs {$hexcompare}");
+	}
+}
+echo "OK" . PHP_EOL;
+echo '--- Usecase of Emoji ---' . \PHP_EOL;
+var_dump(mb_levenshtein("🙇‍♀️", "🙇‍♂️"));
+var_dump(mb_levenshtein("🙇", "🙇‍♂️"));
+?>
+--EXPECT--
+--- Equal ---
+int(0)
+--- First string empty ---
+int(3)
+--- Second string empty ---
+int(3)
+--- Both empty ---
+int(0)
+int(0)
+--- 1 character ---
+int(1)
+--- 2 character swapped ---
+int(2)
+--- Inexpensive deletion ---
+int(2)
+--- Expensive deletion ---
+int(10)
+--- Inexpensive insertion ---
+int(2)
+--- Expensive insertion ---
+int(10)
+--- Expensive replacement ---
+int(3)
+--- Very expensive replacement ---
+int(4)
+--- 128 codepoints ---
+int(2)
+--- 128 codepoints over ---
+int(2)
+int(256)
+--- 128 codepoints over only $string1 ---
+int(128)
+--- 128 codepoints over only $string2 ---
+int(130)
+--- 128 codepoints over Hiragana ---
+int(2)
+--- 128 codepoints over Hiragana in Shift_JIS ---
+int(2)
+--- café in ISO-8859-1 ---
+int(1)
+--- Variable selector ---
+int(1)
+int(1)
+--- Usecase of userland code ---
+OK
+--- Usecase of Emoji ---
+int(1)
+int(3)
@@ -0,0 +1,69 @@
+<?php
+/* from: https://github.com/arabcoders/watchstate/blob/18e048b3a692aa0f46d34bde03c8518854907e4f/src/API/Backend/Mismatched.php#L331 */
+/**
+ * Copyright (c) 2024 ArabCoders
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is furnished
+ * to do so, subject to the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+/**
+ * Userland Levenshtein distance over UTF-8 characters, with every edit costing 1.
+ *
+ * The arguments are swapped when $str1 is the shorter string, so the rows of the
+ * cost matrix always have the length of the shorter input.
+ *
+ * @param string $str1 The first string.
+ * @param string $str2 The second string.
+ *
+ * @return int The Levenshtein distance between the two strings.
+ */
+function watchstate_mb_levenshtein(string $str1, string $str2): int
+{
+	$length1 = mb_strlen($str1, 'UTF-8');
+	$length2 = mb_strlen($str2, 'UTF-8');
+
+	if ($length1 < $length2) {
+		return watchstate_mb_levenshtein($str2, $str1);
+	}
+
+	// From here on $length1 >= $length2, so an empty $str1 implies an empty $str2.
+	if (0 === $length1) {
+		return $length2;
+	}
+
+	if ($str1 === $str2) {
+		return 0;
+	}
+
+	// The characters of $str2 are the same for every row; extract them once
+	// instead of calling mb_substr() again for each character of $str1.
+	$chars2 = [];
+	for ($j = 0; $j < $length2; $j++) {
+		$chars2[] = mb_substr($str2, $j, 1, 'UTF-8');
+	}
+
+	$prevRow = range(0, $length2);
+
+	for ($i = 0; $i < $length1; $i++) {
+		$c1 = mb_substr($str1, $i, 1, 'UTF-8');
+		$currentRow = [$i + 1];
+
+		foreach ($chars2 as $j => $c2) {
+			$insertions = $prevRow[$j + 1] + 1;
+			$deletions = $currentRow[$j] + 1;
+			$substitutions = $prevRow[$j] + (($c1 !== $c2) ? 1 : 0);
+			$currentRow[] = min($insertions, $deletions, $substitutions);
+		}
+
+		$prevRow = $currentRow;
+	}
+	return $prevRow[$length2];
+}
+