Skip to content

Commit af72b0b

Browse files
committed
[Draft][Require RFC] mb_levenshtein function
1 parent 090b53b commit af72b0b

File tree

4 files changed

+220
-1
lines changed

4 files changed

+220
-1
lines changed

ext/mbstring/mbstring.c

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3166,6 +3166,126 @@ PHP_FUNCTION(mb_rtrim)
31663166
php_do_mb_trim(INTERNAL_FUNCTION_PARAM_PASSTHRU, MB_RTRIM);
31673167
}
31683168

3169+
PHP_FUNCTION(mb_levenshtein)
3170+
{
3171+
zend_string *string1, *string2, *enc_name = NULL;
3172+
zend_long cost_ins = 1;
3173+
zend_long cost_rep = 1;
3174+
zend_long cost_del = 1;
3175+
3176+
ZEND_PARSE_PARAMETERS_START(2, 6)
3177+
Z_PARAM_STR(string1)
3178+
Z_PARAM_STR(string2)
3179+
Z_PARAM_OPTIONAL
3180+
Z_PARAM_LONG(cost_ins)
3181+
Z_PARAM_LONG(cost_rep)
3182+
Z_PARAM_LONG(cost_del)
3183+
Z_PARAM_STR_OR_NULL(enc_name)
3184+
ZEND_PARSE_PARAMETERS_END();
3185+
3186+
if (ZSTR_LEN(string1) == 0) {
3187+
RETVAL_LONG(ZSTR_LEN(string2) * cost_ins);
3188+
}
3189+
3190+
if (ZSTR_LEN(string2) == 0) {
3191+
RETVAL_LONG(ZSTR_LEN(string1) * cost_del);
3192+
}
3193+
3194+
const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 6);
3195+
if (!enc) {
3196+
RETURN_THROWS();
3197+
}
3198+
3199+
/* When all costs are equal, levenshtein fulfills the requirements of a metric, which means
3200+
* that the distance is symmetric. If string1 is shorter than string 2 we can save memory (and CPU time)
3201+
* by having shorter rows (p1 & p2). */
3202+
if (ZSTR_LEN(string1) < ZSTR_LEN(string2) && cost_ins == cost_rep && cost_rep == cost_del) {
3203+
zend_string *tmp = string1;
3204+
string1 = string2;
3205+
string2 = tmp;
3206+
}
3207+
3208+
uint32_t wchar_buf_1[128], wchar_buf_2[128];
3209+
size_t i1, i2;
3210+
zend_long *p1, *p2, *tmp;
3211+
size_t strlen_1 = mb_get_strlen(string1, enc);
3212+
size_t strlen_2 = mb_get_strlen(string2, enc);
3213+
size_t len_1 = 0;
3214+
size_t len_2 = 0;
3215+
size_t in_len_1 = ZSTR_LEN(string1);
3216+
size_t in_len_2 = ZSTR_LEN(string2);
3217+
unsigned char *in_1 = (unsigned char*)ZSTR_VAL(string1);
3218+
unsigned char *in_2 = (unsigned char*)ZSTR_VAL(string2);
3219+
unsigned int state = 0;
3220+
3221+
zend_long c0, c1, c2;
3222+
3223+
p1 = safe_emalloc(strlen_1, sizeof(zend_long), 0);
3224+
p2 = safe_emalloc(strlen_2, sizeof(zend_long), 0);
3225+
3226+
for (i2 = 0; i2 <= strlen_2; i2++) {
3227+
p1[i2] = i2 * cost_ins;
3228+
}
3229+
3230+
zend_long tmp_wchar_len_1 = 0;
3231+
zend_long tmp_wchar_len_2 = 0;
3232+
bool first = true;
3233+
3234+
while (in_len_1) {
3235+
tmp_wchar_len_1 = enc->to_wchar(&in_1, &in_len_1, wchar_buf_1, 128, &state);
3236+
len_1 += tmp_wchar_len_1;
3237+
ZEND_ASSERT(in_len_1 <= 128);
3238+
tmp_wchar_len_2 = enc->to_wchar(&in_2, &in_len_2, wchar_buf_2, 128, &state);
3239+
len_2 += tmp_wchar_len_2;
3240+
ZEND_ASSERT(in_len_2 <= 128);
3241+
3242+
for (i1 = 0; i1 < tmp_wchar_len_1; i1++) {
3243+
/* First loop that does not cross a 128 code points */
3244+
if (first) {
3245+
p2[0] = p1[0] + cost_del;
3246+
}
3247+
/* Insertion process when there is a surplus of 128 code points. */
3248+
if (tmp_wchar_len_2 == 0) {
3249+
for (i2 = 0; i2 < tmp_wchar_len_1; i2++) {
3250+
c0 = p1[i2 + (len_2 - tmp_wchar_len_1)] + cost_rep;
3251+
c1 = p1[i2 + (len_2 - tmp_wchar_len_1) + 1] + cost_del;
3252+
if (c1 < c0) {
3253+
c0 = c1;
3254+
}
3255+
c2 = p2[i2 + (len_2 - tmp_wchar_len_1)] + cost_ins;
3256+
if (c2 < c0) {
3257+
c0 = c2;
3258+
}
3259+
p2[i2 + (len_2 - tmp_wchar_len_1) + 1] = c0;
3260+
}
3261+
} else {
3262+
for (i2 = 0; i2 < tmp_wchar_len_2; i2++) {
3263+
c0 = p1[i2 + (len_2 - tmp_wchar_len_2)] + (wchar_buf_1[i1] == wchar_buf_2[i2] ? 0 : cost_rep);
3264+
c1 = p1[i2 + (len_2 - tmp_wchar_len_2) + 1] + cost_del;
3265+
if (c1 < c0) {
3266+
c0 = c1;
3267+
}
3268+
c2 = p2[i2 + (len_2 - tmp_wchar_len_2)] + cost_ins;
3269+
if (c2 < c0) {
3270+
c0 = c2;
3271+
}
3272+
p2[i2 + (len_2 - tmp_wchar_len_2) + 1] = c0;
3273+
}
3274+
}
3275+
tmp = p1;
3276+
p1 = p2;
3277+
p2 = tmp;
3278+
}
3279+
first = false;
3280+
}
3281+
3282+
c0 = p1[strlen_2];
3283+
efree(p1);
3284+
efree(p2);
3285+
3286+
RETVAL_LONG(c0);
3287+
}
3288+
31693289
static const mbfl_encoding **duplicate_elist(const mbfl_encoding **elist, size_t size)
31703290
{
31713291
const mbfl_encoding **new_elist = safe_emalloc(size, sizeof(mbfl_encoding*), 0);

ext/mbstring/mbstring.stub.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,8 @@ function mb_ltrim(string $string, ?string $characters = null, ?string $encoding
145145

146146
function mb_rtrim(string $string, ?string $characters = null, ?string $encoding = null): string {}
147147

148+
// Weighted Levenshtein (edit) distance between $string1 and $string2; each cost defaults to 1.
// NOTE(review): a null $encoding presumably selects the default internal encoding — confirm against php_mb_get_encoding().
function mb_levenshtein(string $string1, string $string2, int $insertion_cost = 1, int $replacement_cost = 1, int $deletion_cost = 1, ?string $encoding = null): int {}
149+
148150
/** @refcount 1 */
149151
function mb_detect_encoding(string $string, array|string|null $encodings = null, bool $strict = false): string|false {}
150152

ext/mbstring/mbstring_arginfo.h

Lines changed: 12 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
--TEST--
2+
mb_levenshtein() function test
3+
--FILE--
4+
<?php

// Identical inputs: distance is zero.
echo '--- Equal ---' . \PHP_EOL;
var_dump(mb_levenshtein('12345', '12345'));

// Empty operands: distance is pure insertion / pure deletion.
echo '--- First string empty ---' . \PHP_EOL;
var_dump(mb_levenshtein('', 'xyz'));
echo '--- Second string empty ---' . \PHP_EOL;
var_dump(mb_levenshtein('xyz', ''));
echo '--- Both empty ---' . \PHP_EOL;
var_dump(mb_levenshtein('', ''));
var_dump(mb_levenshtein('', '', 10, 10, 10));

echo '--- 1 character ---' . \PHP_EOL;
var_dump(mb_levenshtein('1', '2'));
echo '--- 2 character swapped ---' . \PHP_EOL;
var_dump(mb_levenshtein('12', '21'));

// Argument order is (insertion, replacement, deletion).
echo '--- Inexpensive deletion ---' . \PHP_EOL;
var_dump(mb_levenshtein('2121', '11', 2));
echo '--- Expensive deletion ---' . \PHP_EOL;
var_dump(mb_levenshtein('2121', '11', 2, 1, 5));

echo '--- Inexpensive insertion ---' . \PHP_EOL;
var_dump(mb_levenshtein('11', '2121'));
echo '--- Expensive insertion ---' . \PHP_EOL;
var_dump(mb_levenshtein('11', '2121', 5));

// Replacement (3) beats delete + insert (4) here, but not when it costs 9.
echo '--- Expensive replacement ---' . \PHP_EOL;
var_dump(mb_levenshtein('111', '121', 2, 3, 2));
echo '--- Very expensive replacement ---' . \PHP_EOL;
var_dump(mb_levenshtein('111', '121', 2, 9, 2));

// Inputs longer than 128 code points, in one or both operands.
echo '--- 128 codepoints over ---' . \PHP_EOL;
var_dump(mb_levenshtein(str_repeat("a", 128) . "abc", str_repeat("a", 128) . "aaa"));
echo '--- 128 codepoints over only $string1 ---' . \PHP_EOL;
var_dump(mb_levenshtein(str_repeat("a", 128) . "abc", "aaa"));
echo '--- 128 codepoints over only $string2 ---' . \PHP_EOL;
var_dump(mb_levenshtein("abc", str_repeat("a", 128) . "aaa"));
// The repeated prefix must be a real multibyte character; with an empty
// string the inputs would be only 3 code points long.
echo '--- 128 codepoints over Hiragana ---' . \PHP_EOL;
var_dump(mb_levenshtein(str_repeat("あ", 128) . "あああ", str_repeat("あ", 128) . "あいう"));

// Same comparison in a non-UTF-8 multibyte encoding: 131 x "あ" versus
// 128 x "あ" followed by "あいう" differs in the last two characters only.
echo '--- 128 codepoints over Hiragana in Shift_JIS ---' . \PHP_EOL;
$hiragana_a = mb_convert_encoding("あ", "SJIS", "UTF-8");
$hiragana_aiu = mb_convert_encoding("あいう", "SJIS", "UTF-8");
var_dump(mb_levenshtein(str_repeat($hiragana_a, 128 + 3), str_repeat($hiragana_a, 128) . $hiragana_aiu, encoding: "SJIS"));
?>
51+
--EXPECT--
52+
--- Equal ---
53+
int(0)
54+
--- First string empty ---
55+
int(3)
56+
--- Second string empty ---
57+
int(3)
58+
--- Both empty ---
59+
int(0)
60+
int(0)
61+
--- 1 character ---
62+
int(1)
63+
--- 2 character swapped ---
64+
int(2)
65+
--- Inexpensive deletion ---
66+
int(2)
67+
--- Expensive deletion ---
68+
int(10)
69+
--- Inexpensive insertion ---
70+
int(2)
71+
--- Expensive insertion ---
72+
int(10)
73+
--- Expensive replacement ---
74+
int(3)
75+
--- Very expensive replacement ---
76+
int(4)
77+
--- 128 codepoints over ---
78+
int(2)
79+
--- 128 codepoints over only $string1 ---
80+
int(128)
81+
--- 128 codepoints over only $string2 ---
82+
int(130)
83+
--- 128 codepoints over Hiragana ---
84+
int(2)
85+
--- 128 codepoints over Hiragana in Shift_JIS ---
86+
int(2)

0 commit comments

Comments
 (0)