|
| 1 | +<?php |
| 2 | +/* from: https://github.com/KEINOS/mb_levenshtein/blob/master/mb_levenshtein.php |
| 3 | + * MIT License |
| 4 | + * |
| 5 | + * Copyright (c) 2021 KEINOS and the contributors (https://git.io/J9Gmd) |
| 6 | + * |
| 7 | + * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 8 | + * of this software and associated documentation files (the "Software"), to deal |
| 9 | + * in the Software without restriction, including without limitation the rights |
| 10 | + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 11 | + * copies of the Software, and to permit persons to whom the Software is |
| 12 | + * furnished to do so, subject to the following conditions: |
| 13 | + * |
| 14 | + * The above copyright notice and this permission notice shall be included in all |
| 15 | + * copies or substantial portions of the Software. |
| 16 | + * |
| 17 | + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 18 | + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 19 | + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 20 | + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 21 | + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 22 | + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 23 | + * SOFTWARE. |
| 24 | + */ |
/**
 * mb_levenshtein.
 *
 * Multibyte-aware Levenshtein distance between two UTF-8 strings.
 *
 * Fast path: both strings are re-encoded with convert_mb_ascii() so that every
 * multibyte character becomes one byte, then the native levenshtein() is used.
 *
 * That encoding only has 128 free byte values (128-255). When the two strings
 * contain more than 128 distinct multibyte characters between them, the
 * re-encoded bytes would wrap around and collide with ASCII, silently giving a
 * wrong distance. In that case the distance is computed here over character
 * tokens instead, using the same recurrence and cost semantics as the native
 * levenshtein().
 *
 * Didactic example of how to use convert_mb_ascii(). For better performance,
 * in a real application with a single input string matched against many
 * strings from a database, you will probably want to pre-encode the input
 * only once.
 *
 * @param string  $s1       One of the strings being evaluated for Levenshtein distance.
 * @param string  $s2       One of the strings being evaluated for Levenshtein distance.
 * @param integer $cost_ins Defines the cost of insertion.
 * @param integer $cost_rep Defines the cost of replacement.
 * @param integer $cost_del Defines the cost of deletion.
 *
 * @return integer
 */
function keinos_mb_levenshtein($s1, $s2, $cost_ins = 1, $cost_rep = 1, $cost_del = 1)
{
    // Work on copies: the originals are still needed if the fallback is taken.
    $encoded1 = $s1;
    $encoded2 = $s2;

    $charMap = array();
    convert_mb_ascii($encoded1, $charMap);
    convert_mb_ascii($encoded2, $charMap);

    // At most 128 entries fit into the byte range 128-255 without wrapping.
    if (count($charMap) <= 128) {
        return levenshtein($encoded1, $encoded2, $cost_ins, $cost_rep, $cost_del);
    }

    // Fallback: too many distinct multibyte characters for a one-byte encoding.
    // Split both strings into tokens (one multibyte sequence or one single
    // byte) with the same sequence pattern convert_mb_ascii() uses.
    $tokenPattern = '/[\xC0-\xF7][\x80-\xBF]+|./s';
    $tokens       = array();

    preg_match_all($tokenPattern, $s1, $tokens);
    $chars1 = isset($tokens[0]) ? $tokens[0] : array();

    preg_match_all($tokenPattern, $s2, $tokens);
    $chars2 = isset($tokens[0]) ? $tokens[0] : array();

    $len1 = count($chars1);
    $len2 = count($chars2);

    // The native function works with integer costs; mirror that here.
    $cost_ins = (int) $cost_ins;
    $cost_rep = (int) $cost_rep;
    $cost_del = (int) $cost_del;

    // Two-row dynamic programming: $previous[$j] is the cost of turning the
    // tokens consumed so far from $chars1 into the first $j tokens of $chars2.
    $previous = array();
    for ($j = 0; $j <= $len2; $j++) {
        $previous[$j] = $j * $cost_ins;
    }

    for ($i = 0; $i < $len1; $i++) {
        $current = array($previous[0] + $cost_del);

        for ($j = 0; $j < $len2; $j++) {
            $replace = $previous[$j] + ($chars1[$i] === $chars2[$j] ? 0 : $cost_rep);
            $delete  = $previous[$j + 1] + $cost_del;
            $insert  = $current[$j] + $cost_ins;

            $current[$j + 1] = min($replace, $delete, $insert);
        }

        $previous = $current;
    }

    return $previous[$len2];
}
| 49 | + |
/**
 * convert_mb_ascii.
 *
 * Rewrites a UTF-8 string in place so that each multibyte sequence is
 * replaced by one single byte, making the string usable with byte-oriented
 * functions such as levenshtein.
 *
 * The replacement bytes come from a dynamic encoding held in $map (an in/out
 * parameter): every multibyte sequence not yet present in the map is given
 * the next byte from the range [128-255], in order of appearance. Passing the
 * same map to several calls keeps the encoding consistent across strings.
 *
 * The range only has room for 128 different multibyte code points over the
 * whole set of strings sharing the map. No bound check is performed here:
 * past that limit chr() wraps around and the assigned bytes are no longer
 * unique.
 *
 * A string without any multibyte sequence is left untouched, and so is the map.
 *
 * @param string $str UTF-8 string to be converted to extended ASCII (modified in place).
 * @param array  $map Reference of the map (multibyte sequence => single byte).
 *
 * @return void
 */
function convert_mb_ascii(&$str, &$map)
{
    // Collect every multibyte sequence: one lead byte plus its continuation bytes.
    $found = array();
    $hits  = preg_match_all('/[\xC0-\xF7][\x80-\xBF]+/', $str, $found);
    if (! $hits) {
        // Nothing to remap (plain ASCII string).
        return;
    }

    // The next free byte value follows the entries already in the map.
    $nextSlot = 128 + count($map);
    foreach ($found[0] as $sequence) {
        if (isset($map[$sequence])) {
            continue; // already encoded by this or an earlier call
        }

        $map[$sequence] = chr($nextSlot);
        $nextSlot++;
    }

    // Apply the whole map in a single pass.
    $str = strtr($str, $map);
}
| 88 | + |
0 commit comments