diff --git a/lib/elixir/lib/string.ex b/lib/elixir/lib/string.ex
index 6de7a20b1f0..0ca14c3c07f 100644
--- a/lib/elixir/lib/string.ex
+++ b/lib/elixir/lib/string.ex
@@ -3066,74 +3066,13 @@ defmodule String do
   @spec jaro_distance(t, t) :: float
   def jaro_distance(string1, string2)
 
-  def jaro_distance(string, string), do: 1.0
+  def jaro_distance(string, string) when is_binary(string), do: 1.0
   def jaro_distance(_string, ""), do: 0.0
   def jaro_distance("", _string), do: 0.0
 
   def jaro_distance(string1, string2) when is_binary(string1) and is_binary(string2) do
-    {chars1, len1} = graphemes_and_length(string1)
-    {chars2, len2} = graphemes_and_length(string2)
-
-    case match(chars1, len1, chars2, len2) do
-      {0, _trans} ->
-        0.0
-
-      {comm, trans} ->
-        (comm / len1 + comm / len2 + (comm - trans) / comm) / 3
-    end
-  end
-
-  defp match(chars1, len1, chars2, len2) do
-    if len1 < len2 do
-      match(chars1, chars2, div(len2, 2) - 1)
-    else
-      match(chars2, chars1, div(len1, 2) - 1)
-    end
-  end
-
-  defp match(chars1, chars2, lim) do
-    match(chars1, chars2, {0, lim}, {0, 0, -1}, 0)
-  end
-
-  defp match([char | rest], chars, range, state, idx) do
-    {chars, state} = submatch(char, chars, range, state, idx)
-
-    case range do
-      {lim, lim} -> match(rest, tl(chars), range, state, idx + 1)
-      {pre, lim} -> match(rest, chars, {pre + 1, lim}, state, idx + 1)
-    end
-  end
-
-  defp match([], _, _, {comm, trans, _}, _), do: {comm, trans}
-
-  defp submatch(char, chars, {pre, _} = range, state, idx) do
-    case detect(char, chars, range) do
-      nil ->
-        {chars, state}
-
-      {subidx, chars} ->
-        {chars, proceed(state, idx - pre + subidx)}
-    end
-  end
-
-  defp detect(char, chars, {pre, lim}) do
-    detect(char, chars, pre + 1 + lim, 0, [])
-  end
-
-  defp detect(_char, _chars, 0, _idx, _acc), do: nil
-  defp detect(_char, [], _lim, _idx, _acc), do: nil
-
-  defp detect(char, [char | rest], _lim, idx, acc), do: {idx, Enum.reverse(acc, [nil | rest])}
-
-  defp detect(char, [other | rest], lim, idx, acc),
-    do: detect(char, rest, lim - 1, idx + 1, [other | acc])
-
-  defp proceed({comm, trans, former}, current) do
-    if current < former do
-      {comm + 1, trans + 1, current}
-    else
-      {comm + 1, trans, current}
-    end
+    # TODO: Replace by :string.jaro_similarity/2 when we require Erlang/OTP 27+
+    :elixir_utils.jaro_similarity(string1, string2)
   end
 
   @doc """
@@ -3168,7 +3107,6 @@ defmodule String do
             codepoint_byte_size: 1,
             grapheme_byte_size: 1,
             grapheme_to_binary: 1,
-            graphemes_and_length: 1,
             reverse_characters_to_binary: 1}
 
   defp byte_size_unicode(binary) when is_binary(binary), do: byte_size(binary)
@@ -3205,22 +3143,6 @@ defmodule String do
 
   defp grapheme_byte_size([], acc), do: acc
 
-  defp graphemes_and_length(string),
-    do: graphemes_and_length(string, [], 0)
-
-  defp graphemes_and_length(string, acc, length) do
-    case :unicode_util.gc(string) do
-      [gc | rest] ->
-        graphemes_and_length(rest, [gc | acc], length + 1)
-
-      [] ->
-        {:lists.reverse(acc), length}
-
-      {:error, <<byte, rest::bits>>} ->
-        graphemes_and_length(rest, [<<byte>> | acc], length + 1)
-    end
-  end
-
   defp reverse_characters_to_binary(acc),
     do: acc |> :lists.reverse() |> :unicode.characters_to_binary()
 end
diff --git a/lib/elixir/src/elixir_utils.erl b/lib/elixir/src/elixir_utils.erl
index 69e80c2a54e..539300f2bea 100644
--- a/lib/elixir/src/elixir_utils.erl
+++ b/lib/elixir/src/elixir_utils.erl
@@ -8,7 +8,7 @@
   read_file_type/1, read_file_type/2, read_link_type/1, read_posix_mtime_and_size/1,
   change_posix_time/2, change_universal_time/2,
   guard_op/2, extract_splat_guards/1, extract_guards/1,
-  erlang_comparison_op_to_elixir/1, erl_fa_to_elixir_fa/2]).
+  erlang_comparison_op_to_elixir/1, erl_fa_to_elixir_fa/2, jaro_similarity/2]).
 -include("elixir.hrl").
 -include_lib("kernel/include/file.hrl").
 
@@ -223,3 +223,113 @@ returns_boolean({'__block__', _, Exprs}) ->
   returns_boolean(lists:last(Exprs));
 
 returns_boolean(_) -> false.
+
+
+% TODO: Remove me when we require Erlang/OTP 27+
+% This is a polyfill for older versions, based on the code from
+% https://github.com/erlang/otp/pull/7879
+%
+% Returns the Jaro similarity of two strings, compared grapheme cluster
+% by grapheme cluster. Two empty strings score 1.0; if only one of them
+% is empty, or no cluster matches, the score is 0.0.
+% Raises error({badarg, Err}) when unicode_util:gc/1 returns {error, Err}
+% for either argument.
+-spec jaro_similarity(String1, String2) -> Similarity when
+    String1 :: unicode:chardata(),
+    String2 :: unicode:chardata(),
+    Similarity :: float(). %% Between +0.0 and 1.0
+jaro_similarity(A0, B0) ->
+    {A, ALen} = str_to_gcl_and_length(A0),
+    {B, BLen} = str_to_indexmap(B0),
+    %% Index I of A may match index J of B only when abs(I - J) < Dist.
+    %% Dist is clamped to 1 so that two equal single-cluster strings match
+    %% at index 0; max(ALen, BLen) div 2 alone would be 0 and score them 0.0.
+    Dist = max(1, max(ALen, BLen) div 2),
+    {AM, BM} = jaro_match(A, B, -Dist, Dist, [], []),
+    if
+        ALen =:= 0 andalso BLen =:= 0 ->
+            1.0;
+        ALen =:= 0 orelse BLen =:= 0 ->
+            0.0;
+        AM =:= [] ->
+            0.0;
+        true ->
+            {M,T} = jaro_calc_mt(AM, BM, 0, 0),
+            %% T counts mismatched positions, so T/2 is the transposition count.
+            (M/ALen + M/BLen + (M-T/2)/M) / 3
+    end.
+
+%% Walks the clusters of A in order, sliding the (Min, Max) window by one
+%% per cluster. Each cluster takes the first remaining index of the same
+%% cluster in B that falls inside the window. Returns {AM, BM}: AM holds
+%% the matched clusters in reverse order of A, BM holds {IndexInB, Cluster}
+%% pairs in decreasing index order.
+jaro_match([A|As], B0, Min, Max, AM, BM) ->
+    case jaro_detect(maps:get(A, B0, []), Min, Max) of
+        false ->
+            jaro_match(As, B0, Min+1, Max+1, AM, BM);
+        {J, Remain} ->
+            %% Index J is now used; keep only the later occurrences.
+            B = B0#{A => Remain},
+            jaro_match(As, B, Min+1, Max+1, [A|AM], add_rsorted({J,A},BM))
+    end;
+jaro_match(_A, _B, _Min, _Max, AM, BM) ->
+    {AM, BM}.
+
+%% Takes an increasing list of indices and returns {Idx, Rest} for the
+%% first Idx with Min < Idx < Max, or false when there is none. Indices
+%% at or below Min are skipped; the scan stops at the first one at or
+%% above Max.
+jaro_detect([Idx|Rest], Min, Max) when Min < Idx, Idx < Max ->
+    {Idx, Rest};
+jaro_detect([Idx|Rest], Min, Max) when Idx < Max ->
+    jaro_detect(Rest, Min, Max);
+jaro_detect(_, _, _) ->
+    false.
+
+%% Walks both match lists in lockstep. M counts the pairs; T counts the
+%% pairs whose clusters differ.
+jaro_calc_mt([CharA|AM], [{_, CharA}|BM], M, T) ->
+    jaro_calc_mt(AM, BM, M+1, T);
+jaro_calc_mt([_|AM], [_|BM], M, T) ->
+    jaro_calc_mt(AM, BM, M+1, T+1);
+jaro_calc_mt([], [], M, T) ->
+    {M, T}.
+
+
+%% Returns {Clusters, Length}: the grapheme clusters of S0 in order and
+%% how many there are.
+str_to_gcl_and_length(S0) ->
+    gcl_and_length(unicode_util:gc(S0), [], 0).
+
+gcl_and_length([C|Str], Acc, N) ->
+    gcl_and_length(unicode_util:gc(Str), [C|Acc], N+1);
+gcl_and_length([], Acc, N) ->
+    {lists:reverse(Acc), N};
+gcl_and_length({error, Err}, _, _) ->
+    error({badarg, Err}).
+
+%% Returns {Map, Length}: Map takes each grapheme cluster of S to the
+%% increasing list of zero-based indices where it occurs, Length is the
+%% number of clusters.
+str_to_indexmap(S) ->
+    [M|L] = str_to_map(unicode_util:gc(S), 0),
+    {M,L}.
+
+%% Builds the index map back to front; the result is the improper list
+%% [Map|Length] so that both values come out of a single traversal.
+str_to_map([], L) -> [#{}|L];
+str_to_map([G | Gs], I) ->
+    [M|L] = str_to_map(unicode_util:gc(Gs), I+1),
+    [maps:put(G, [I | maps:get(G, M, [])], M)| L];
+str_to_map({error,Error}, _) ->
+    error({badarg, Error}).
+
+%% Inserts A into BM, which is kept in decreasing order.
+add_rsorted(A, [H|_]=BM) when A > H ->
+    [A|BM];
+add_rsorted(A, [H|BM]) ->
+    [H|add_rsorted(A,BM)];
+add_rsorted(A, []) ->
+    [A].
+
diff --git a/lib/elixir/test/elixir/string_test.exs b/lib/elixir/test/elixir/string_test.exs
index 9339aae765c..1980fbffe29 100644
--- a/lib/elixir/test/elixir/string_test.exs
+++ b/lib/elixir/test/elixir/string_test.exs
@@ -982,7 +982,7 @@ defmodule StringTest do
     assert String.jaro_distance("marhha", "martha") == 0.888888888888889
     assert String.jaro_distance("dwayne", "duane") == 0.8222222222222223
     assert String.jaro_distance("dixon", "dicksonx") == 0.7666666666666666
-    assert String.jaro_distance("xdicksonx", "dixon") == 0.7851851851851852
+    assert String.jaro_distance("xdicksonx", "dixon") == 0.7518518518518519
     assert String.jaro_distance("shackleford", "shackelford") == 0.9696969696969697
     assert String.jaro_distance("dunningham", "cunnigham") == 0.8962962962962964
     assert String.jaro_distance("nichleson", "nichulson") == 0.9259259259259259
@@ -999,6 +999,7 @@ defmodule StringTest do
     assert String.jaro_distance("jon", "john") == 0.9166666666666666
     assert String.jaro_distance("jon", "jan") == 0.7777777777777777
     assert String.jaro_distance("семена", "стремя") == 0.6666666666666666
+    assert String.jaro_distance("Sunday", "Saturday") == 0.7194444444444444
   end
 
   test "myers_difference/2" do