Skip to content

Commit d27cfa0

Browse files
committed
Update test code to use KEINOS/mb_levenshtein
KEINOS/mb_levenshtein is a userland function that computes the Levenshtein distance over multibyte code points. We use that code in the test code.
1 parent cf56777 commit d27cfa0

File tree

2 files changed

+97
-44
lines changed

2 files changed

+97
-44
lines changed

ext/mbstring/tests/mb_levenshtein.phpt

Lines changed: 9 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ mb_levenshtein() function test
44
mbstring
55
--FILE--
66
<?php
7-
7+
require_once("mb_levenshtein_userland.inc");
88
echo '--- Equal ---' . \PHP_EOL;
99
var_dump(mb_levenshtein('12345', '12345'));
1010

@@ -57,43 +57,15 @@ $cafe = mb_convert_encoding("café", "ISO-8859-1", "UTF-8");
5757
var_dump(mb_levenshtein("cafe", $cafe, encoding: "ISO-8859-1"));
5858

5959
echo '--- Usecase of userland code ---' . \PHP_EOL;
60-
/* from: https://qiita.com/mpyw/items/2b636827730e06c71e3d */
61-
$query = 'ほあようごぁいまーしゅ';
62-
$comps = [
63-
'こんにちはー',
64-
'おはようございまーす',
65-
'こんばんはー',
66-
'おやすみなさーい',
67-
'いただきまーす',
68-
'おつかれさまー',
69-
'ぬぁあああんつかれたもぉぉぉぉぉぉん',
70-
];
71-
$min = 99999;
72-
$min_key = 0;
73-
foreach ($comps as $key => $comp) {
74-
$sim = mb_levenshtein($query, $comp);
75-
if ($min >= $sim) {
76-
$min = $sim;
77-
$min_key = $key;
78-
}
79-
}
80-
var_dump($comps[$min_key]);
8160

82-
$base = 'やんほぬ';
83-
$comps = [
84-
'かんのみほ',
85-
'かんのみほう',
86-
'かんぺみろ',
87-
'ああいいふろ',
88-
'ちゃんとみろ',
89-
'ターミナルさん',
90-
];
91-
foreach ($comps as $comp) {
92-
var_dump(mb_levenshtein($base, $comp));
61+
// Cross-check the native mb_levenshtein() against the userland
// keinos_mb_levenshtein() on 100 randomly generated strings.
$compare = "あいうえおABCDEF";
for ($i = 0; $i < 100; $i++) {
    // Map each of the 10 random bytes to the UTF-8 character whose code point
    // equals the byte value (U+0000..U+00FF), so the input mixes single-byte
    // and two-byte characters.
    $bytes = implode("", array_map(function ($byte) { return mb_chr(ord($byte), "UTF-8"); }, str_split(random_bytes(10))));
    if (mb_levenshtein($bytes, $compare) !== keinos_mb_levenshtein($bytes, $compare)) {
        // Abort the test and report the input on which the two implementations disagree.
        throw new Exception("mb_levenshtein compare error: {$bytes}");
    }
}
94-
95-
/* from: https://qiita.com/suin/items/a0a8227addad11ff2ea7 */
96-
var_dump(mb_levenshtein('あとうかい', 'かとうあい')); // int(2)
68+
echo "OK" . PHP_EOL;
9769
?>
9870
--EXPECT--
9971
--- Equal ---
@@ -136,11 +108,4 @@ int(2)
136108
--- café in ISO-8859-1 ---
137109
int(1)
138110
--- Usecase of userland code ---
139-
string(30) "おはようございまーす"
140-
int(4)
141-
int(4)
142-
int(4)
143-
int(6)
144-
int(5)
145-
int(7)
146-
int(2)
111+
OK
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
<?php
2+
/* from: https://github.com/KEINOS/mb_levenshtein/blob/master/mb_levenshtein.php
3+
* MIT License
4+
*
5+
* Copyright (c) 2021 KEINOS and the contributors (https://git.io/J9Gmd)
6+
*
7+
* Permission is hereby granted, free of charge, to any person obtaining a copy
8+
* of this software and associated documentation files (the "Software"), to deal
9+
* in the Software without restriction, including without limitation the rights
10+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11+
* copies of the Software, and to permit persons to whom the Software is
12+
* furnished to do so, subject to the following conditions:
13+
*
14+
* The above copyright notice and this permission notice shall be included in all
15+
* copies or substantial portions of the Software.
16+
*
17+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23+
* SOFTWARE.
24+
*/
25+
/**
 * mb_levenshtein (userland).
 *
 * Computes a Levenshtein distance for strings that may contain multibyte
 * characters. Both inputs are re-encoded with convert_mb_ascii() using one
 * shared character map, so the same multibyte sequence is replaced by the
 * same single byte in both strings; the built-in byte-oriented levenshtein()
 * is then applied to the re-encoded strings.
 *
 * This is a didactic example. For better performance in a real application
 * where a single input string is matched against many strings, you will
 * probably want to pre-encode the input only once.
 *
 * @param string $s1       One of the strings being evaluated for Levenshtein distance.
 * @param string $s2       One of the strings being evaluated for Levenshtein distance.
 * @param int    $cost_ins Defines the cost of insertion.
 * @param int    $cost_rep Defines the cost of replacement.
 * @param int    $cost_del Defines the cost of deletion.
 *
 * @return int The value returned by levenshtein() for the re-encoded strings.
 */
function keinos_mb_levenshtein($s1, $s2, $cost_ins = 1, $cost_rep = 1, $cost_del = 1)
{
    // A single map is shared by both conversions so that equal characters
    // in $s1 and $s2 end up as equal bytes.
    $sharedMap = [];
    convert_mb_ascii($s1, $sharedMap);
    convert_mb_ascii($s2, $sharedMap);

    $distance = levenshtein($s1, $s2, $cost_ins, $cost_rep, $cost_del);

    return $distance;
}
49+
50+
/**
 * convert_mb_ascii.
 *
 * Convert an UTF-8 encoded string to a single-byte string suitable for
 * functions such as levenshtein.
 *
 * The function uses (and updates) a tailored dynamic encoding (in/out map
 * parameter) where each multibyte sequence is remapped to one byte in the
 * range [128-255], in order of first appearance. ASCII bytes are left as is.
 *
 * Only 128 byte values are available, so at most 128 different multibyte
 * sequences can be mapped over the whole set of strings sharing one map.
 * Going past that limit would make chr() wrap around and alias ASCII bytes,
 * so the function throws instead and leaves both arguments unchanged.
 *
 * @param string $str UTF-8 string to be converted to extended ASCII (modified in place).
 * @param array  $map Reference of the map (multibyte sequence => single byte), updated in place.
 *
 * @return void
 *
 * @throws OverflowException When more than 128 distinct multibyte sequences would be needed.
 */
function convert_mb_ascii(&$str, &$map)
{
    // find all utf-8 multibyte sequences (lead byte followed by continuation bytes)
    $matches = array();
    if (! preg_match_all('/[\xC0-\xF7][\x80-\xBF]+/', $str, $matches)) {
        return; // plain ascii string
    }

    // Build the extended map on a copy so that a failure leaves the caller's
    // map (and string) untouched.
    $updated = $map;
    $nextSlot = count($updated);
    foreach ($matches[0] as $mbc) {
        if (isset($updated[$mbc])) {
            continue; // already met
        }
        if ($nextSlot >= 128) {
            // chr(128 + 128) would wrap to chr(0) and collide with real ASCII bytes.
            throw new OverflowException(
                'convert_mb_ascii supports at most 128 distinct multibyte characters per map'
            );
        }
        $updated[$mbc] = chr(128 + $nextSlot);
        $nextSlot++;
    }
    $map = $updated;

    // finally remap non-ascii characters
    $str = strtr($str, $map);
}
88+

0 commit comments

Comments
 (0)