Skip to content

Commit d784874

Browse files
committed
Add grapheme_levenshtein function.
Measure levenshtein for grapheme cluster unit
1 parent 41c55d1 commit d784874

File tree

4 files changed

+298
-1
lines changed

4 files changed

+298
-1
lines changed

ext/intl/grapheme/grapheme_string.c

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -917,4 +917,185 @@ PHP_FUNCTION(grapheme_str_split)
917917
ubrk_close(bi);
918918
}
919919

920+
PHP_FUNCTION(grapheme_levenshtein)
921+
{
922+
zend_string *string1, *string2;
923+
zend_long cost_ins = 1;
924+
zend_long cost_rep = 1;
925+
zend_long cost_del = 1;
926+
927+
ZEND_PARSE_PARAMETERS_START(2, 5)
928+
Z_PARAM_STR(string1)
929+
Z_PARAM_STR(string2)
930+
Z_PARAM_OPTIONAL
931+
Z_PARAM_LONG(cost_ins)
932+
Z_PARAM_LONG(cost_rep)
933+
Z_PARAM_LONG(cost_del)
934+
ZEND_PARSE_PARAMETERS_END();
935+
936+
if (cost_ins <= 0 || cost_ins > UINT_MAX / 4) {
937+
zend_argument_value_error(3, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
938+
RETURN_THROWS();
939+
}
940+
941+
if (cost_rep <= 0 || cost_rep > UINT_MAX / 4) {
942+
zend_argument_value_error(4, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
943+
RETURN_THROWS();
944+
}
945+
946+
if (cost_del <= 0 || cost_del > UINT_MAX / 4) {
947+
zend_argument_value_error(5, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
948+
RETURN_THROWS();
949+
}
950+
951+
zend_long *p1, *p2, *tmp;
952+
zend_long c0, c1, c2;
953+
zend_long retval;
954+
size_t i2;
955+
char *pstr1, *pstr2;
956+
957+
UChar *ustring1 = NULL;
958+
UChar *ustring2 = NULL;
959+
960+
int32_t ustring1_len = 0;
961+
int32_t ustring2_len = 0;
962+
963+
UErrorCode ustatus1 = U_ZERO_ERROR;
964+
UErrorCode ustatus2 = U_ZERO_ERROR;
965+
966+
/* When all costs are equal, levenshtein fulfills the requirements of a metric, which means
967+
* that the distance is symmetric. If string1 is shorter than string 2 we can save memory (and CPU time)
968+
* by having shorter rows (p1 & p2). */
969+
if (ZSTR_LEN(string1) < ZSTR_LEN(string2) && cost_ins == cost_rep && cost_rep == cost_del) {
970+
zend_string *tmp = string1;
971+
string1 = string2;
972+
string2 = tmp;
973+
}
974+
975+
pstr1 = ZSTR_VAL(string1);
976+
pstr2 = ZSTR_VAL(string2);
977+
978+
intl_convert_utf8_to_utf16(&ustring1, &ustring1_len, pstr1, ZSTR_LEN(string1), &ustatus1);
979+
980+
if ( U_FAILURE( ustatus1 ) ) {
981+
/* Set global error code. */
982+
intl_error_set_code( NULL, ustatus1 );
983+
984+
/* Set error messages. */
985+
intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
986+
if (ustring1) {
987+
efree( ustring1 );
988+
}
989+
RETURN_FALSE;
990+
}
991+
992+
intl_convert_utf8_to_utf16(&ustring2, &ustring2_len, pstr2, ZSTR_LEN(string2), &ustatus2);
993+
994+
if ( U_FAILURE( ustatus2 ) ) {
995+
/* Set global error code. */
996+
intl_error_set_code( NULL, ustatus2 );
997+
998+
/* Set error messages. */
999+
intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
1000+
if (ustring2) {
1001+
efree( ustring2 );
1002+
}
1003+
if (ustring1) {
1004+
efree( ustring1 );
1005+
}
1006+
RETURN_FALSE;
1007+
}
1008+
1009+
UText *ut1 = NULL;
1010+
UText *ut2 = NULL;
1011+
UBreakIterator *bi1, *bi2;
1012+
1013+
int32_t strlen_1, strlen_2;
1014+
strlen_1 = grapheme_split_string(ustring1, ustring1_len, NULL, 0 );
1015+
strlen_2 = grapheme_split_string(ustring2, ustring2_len, NULL, 0 );
1016+
1017+
if (strlen_1 == 0) {
1018+
efree(ustring1);
1019+
efree(ustring2);
1020+
RETURN_LONG(strlen_2 * cost_ins);
1021+
}
1022+
if (strlen_2 == 0) {
1023+
efree(ustring1);
1024+
efree(ustring2);
1025+
RETURN_LONG(strlen_1 * cost_del);
1026+
}
1027+
1028+
unsigned char u_break_iterator_buffer1[U_BRK_SAFECLONE_BUFFERSIZE];
1029+
unsigned char u_break_iterator_buffer2[U_BRK_SAFECLONE_BUFFERSIZE];
1030+
bi1 = grapheme_get_break_iterator((void*)u_break_iterator_buffer1, &ustatus1 );
1031+
bi2 = grapheme_get_break_iterator((void*)u_break_iterator_buffer2, &ustatus2 );
1032+
1033+
ut1 = utext_openUTF8(ut1, pstr1, ZSTR_LEN(string1), &ustatus1);
1034+
ubrk_setUText(bi1, ut1, &ustatus1);
1035+
ut2 = utext_openUTF8(ut2, pstr2, ZSTR_LEN(string2), &ustatus2);
1036+
ubrk_setUText(bi2, ut2, &ustatus2);
1037+
1038+
p1 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0);
1039+
p2 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0);
1040+
1041+
for (i2 = 0; i2 <= strlen_2; i2++) {
1042+
p1[i2] = i2 * cost_ins;
1043+
}
1044+
1045+
int32_t current1 = 0;
1046+
int32_t current2 = 0;
1047+
int32_t pos1 = 0;
1048+
int32_t pos2 = 0;
1049+
int32_t usrch_pos = 0;
1050+
for ( ; pos1 != UBRK_DONE; ) {
1051+
current1 = ubrk_current(bi1);
1052+
pos1 = ubrk_next(bi1);
1053+
if (pos1 == UBRK_DONE) {
1054+
break;
1055+
}
1056+
p2[0] = p1[0] + cost_del;
1057+
for (i2 = 0, pos2 = 0; pos2 != UBRK_DONE; i2++) {
1058+
current2 = ubrk_current(bi2);
1059+
pos2 = ubrk_next(bi2);
1060+
if (pos2 == UBRK_DONE) {
1061+
break;
1062+
}
1063+
usrch_pos = grapheme_strpos_utf16(pstr1 + current1, pos1 - current1, pstr2 + current2, pos2 - current2, 0, NULL, 0, 0);
1064+
if (usrch_pos == 0) {
1065+
c0 = p1[i2];
1066+
} else {
1067+
c0 = p1[i2] + cost_rep;
1068+
}
1069+
c1 = p1[i2 + 1] + cost_del;
1070+
if (c1 < c0) {
1071+
c0 = c1;
1072+
}
1073+
c2 = p2[i2] + cost_ins;
1074+
if (c2 < c0) {
1075+
c0 = c2;
1076+
}
1077+
p2[i2 + 1] = c0;
1078+
}
1079+
ubrk_first(bi2);
1080+
tmp = p1;
1081+
p1 = p2;
1082+
p2 = tmp;
1083+
}
1084+
1085+
utext_close(ut1);
1086+
utext_close(ut2);
1087+
1088+
ubrk_close(bi1);
1089+
ubrk_close(bi2);
1090+
1091+
efree(ustring1);
1092+
efree(ustring2);
1093+
1094+
retval = p1[strlen_2];
1095+
1096+
efree(p1);
1097+
efree(p2);
1098+
RETURN_LONG(retval);
1099+
}
1100+
9201101
/* }}} */

ext/intl/php_intl.stub.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -447,6 +447,8 @@ function grapheme_stristr(string $haystack, string $needle, bool $beforeNeedle =
447447

448448
function grapheme_str_split(string $string, int $length = 1): array|false {}
449449

450+
/**
 * Levenshtein distance between two strings, counted in grapheme clusters.
 *
 * Each cost must be greater than 0 and no larger than UINT_MAX / 4 on the C side;
 * the implementation raises an argument value error otherwise.
 * Returns false when an input string cannot be converted to UTF-16.
 */
function grapheme_levenshtein(string $string1, string $string2, int $insertion_cost = 1, int $replacement_cost = 1, int $deletion_cost = 1): int|false {}
451+
450452
/** @param int $next */
451453
function grapheme_extract(string $haystack, int $size, int $type = GRAPHEME_EXTR_COUNT, int $offset = 0, &$next = null): string|false {}
452454

ext/intl/php_intl_arginfo.h

Lines changed: 11 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
--TEST--
2+
grapheme_levenshtein() function test
3+
--EXTENSIONS--
4+
intl
5+
--FILE--
6+
<?php
7+
echo '--- Equal ---' . \PHP_EOL;
8+
var_dump(grapheme_levenshtein('12345', '12345'));
9+
10+
echo '--- First string empty ---' . \PHP_EOL;
11+
var_dump(grapheme_levenshtein('', 'xyz'));
12+
echo '--- Second string empty ---' . \PHP_EOL;
13+
var_dump(grapheme_levenshtein('xyz', ''));
14+
echo '--- Both empty ---' . \PHP_EOL;
15+
var_dump(grapheme_levenshtein('', ''));
16+
var_dump(grapheme_levenshtein('', '', 10, 10, 10));
17+
18+
echo '--- 1 character ---' . \PHP_EOL;
19+
var_dump(grapheme_levenshtein('1', '2'));
20+
echo '--- 2 character swapped ---' . \PHP_EOL;
21+
var_dump(grapheme_levenshtein('12', '21'));
22+
23+
echo '--- Inexpensive deletion ---' . \PHP_EOL;
24+
var_dump(grapheme_levenshtein('2121', '11', 2));
25+
echo '--- Expensive deletion ---' . \PHP_EOL;
26+
var_dump(grapheme_levenshtein('2121', '11', 2, 1, 5));
27+
28+
//
29+
echo '--- Inexpensive insertion ---' . \PHP_EOL;
30+
var_dump(grapheme_levenshtein('11', '2121'));
31+
echo '--- Expensive insertion ---' . \PHP_EOL;
32+
var_dump(grapheme_levenshtein('11', '2121', 5));
33+
34+
echo '--- Expensive replacement ---' . \PHP_EOL;
35+
var_dump(grapheme_levenshtein('111', '121', 2, 3, 2));
36+
echo '--- Very expensive replacement ---' . \PHP_EOL;
37+
var_dump(grapheme_levenshtein('111', '121', 2, 9, 2));
38+
39+
echo '--- 128 codepoints ---' . \PHP_EOL;
40+
var_dump(grapheme_levenshtein(str_repeat("a", 128), str_repeat("a", 125) . "abc"));
41+
echo '--- 128 codepoints over ---' . \PHP_EOL;
42+
var_dump(grapheme_levenshtein(str_repeat("a", 128) . "abc", str_repeat("a", 128) . "aaa"));
43+
var_dump(grapheme_levenshtein(str_repeat("a", 256) . "abc", "aaa"));
44+
echo '--- 128 codepoints over only $string1 ---' . \PHP_EOL;
45+
var_dump(grapheme_levenshtein(str_repeat("a", 128) . "abc", "aaa"));
46+
echo '--- 128 codepoints over only $string2 ---' . \PHP_EOL;
47+
var_dump(grapheme_levenshtein("abc", str_repeat("a", 128) . "aaa"));
48+
echo '--- 128 codepoints over Hiragana ---' . \PHP_EOL;
49+
var_dump(grapheme_levenshtein(str_repeat("あ", 128) . "あああ", str_repeat("あ", 128) . "あいう"));
50+
51+
echo '--- Variable selector ---' . \PHP_EOL;
52+
$ka = "カ́";
53+
var_dump(grapheme_levenshtein("カ", $ka));
54+
// $nabe and $nabe_E0100 look identical when rendered.
55+
// However, $nabe_E0100 is U+908A followed by the variation selector U+E0100.
56+
// So the expected grapheme_levenshtein() result is 0.
57+
$nabe = '邊';
58+
$nabe_E0100 = "邊󠄀";
59+
var_dump(grapheme_levenshtein($nabe, $nabe_E0100));
60+
61+
// combining character
62+
var_dump(grapheme_levenshtein("\u{0065}\u{0301}", "\u{00e9}"));
63+
?>
64+
--EXPECT--
65+
--- Equal ---
66+
int(0)
67+
--- First string empty ---
68+
int(3)
69+
--- Second string empty ---
70+
int(3)
71+
--- Both empty ---
72+
int(0)
73+
int(0)
74+
--- 1 character ---
75+
int(1)
76+
--- 2 character swapped ---
77+
int(2)
78+
--- Inexpensive deletion ---
79+
int(2)
80+
--- Expensive deletion ---
81+
int(10)
82+
--- Inexpensive insertion ---
83+
int(2)
84+
--- Expensive insertion ---
85+
int(10)
86+
--- Expensive replacement ---
87+
int(3)
88+
--- Very expensive replacement ---
89+
int(4)
90+
--- 128 codepoints ---
91+
int(2)
92+
--- 128 codepoints over ---
93+
int(2)
94+
int(256)
95+
--- 128 codepoints over only $string1 ---
96+
int(128)
97+
--- 128 codepoints over only $string2 ---
98+
int(130)
99+
--- 128 codepoints over Hiragana ---
100+
int(2)
101+
--- Variable selector ---
102+
int(1)
103+
int(0)
104+
int(0)

0 commit comments

Comments
 (0)