File 7311-Add-string-jaro_similarity-2.patch of Package erlang
From 610c1aeb0dbe40ff4218acde73183b075a353412 Mon Sep 17 00:00:00 2001
From: Dan Gudmundsson <dgud@erlang.org>
Date: Thu, 16 Nov 2023 18:41:46 +0100
Subject: [PATCH] Add string:jaro_similarity/2
Calculate word similarity, can for example be used to provide
potential alternatives in error messages.
---
lib/stdlib/doc/src/string.xml | 24 ++++++++++
lib/stdlib/src/string.erl | 80 +++++++++++++++++++++++++++++++-
lib/stdlib/test/string_SUITE.erl | 45 ++++++++++++++++--
3 files changed, 144 insertions(+), 5 deletions(-)
diff --git a/lib/stdlib/doc/src/string.xml b/lib/stdlib/doc/src/string.xml
index 5176f6de60..3c48153045 100644
--- a/lib/stdlib/doc/src/string.xml
+++ b/lib/stdlib/doc/src/string.xml
@@ -244,6 +244,30 @@ true</pre>
</desc>
</func>
+ <func>
+ <name name="jaro_similarity" arity="2" since="OTP 27.0"/>
+ <fsummary>Calculate the Jaro similarity of two strings.</fsummary>
+ <desc>
+ <p>Returns a float between <c>+0.0</c> and <c>1.0</c> representing the
+ <url href="https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance">
+ Jaro similarity</url> between the given strings. Strings with many letters
+ in common relative to their lengths will score closer to <c>1.0</c>.
+ </p>
+ <p>The Jaro distance between two strings can be calculated with <c>JaroDistance = 1.0-JaroSimilarity</c>.
+ </p>
+ <p><em>Example:</em></p>
+ <pre>
+1> <input>string:jaro_similarity("ditto", "ditto").</input>
+1.0
+2> <input>string:jaro_similarity("foo", "bar").</input>
++0.0
+3> <input>string:jaro_similarity("michelle", "michael").</input>
+0.8690476190476191
+4> <input>string:jaro_similarity(<<"Édouard"/utf8>>, <<"Claude">>).</input>
+0.5317460317460317</pre>
+ </desc>
+ </func>
+
<func>
<name name="length" arity="1" since="OTP 20.0"/>
<fsummary>Calculate length of the string.</fsummary>
diff --git a/lib/stdlib/src/string.erl b/lib/stdlib/src/string.erl
index 163bd8d081..02593e3cff 100644
--- a/lib/stdlib/src/string.erl
+++ b/lib/stdlib/src/string.erl
@@ -58,6 +58,7 @@
prefix/2,
split/2,split/3,replace/3,replace/4,
find/2,find/3,
+ jaro_similarity/2,
next_codepoint/1, next_grapheme/1
]).
@@ -85,7 +86,7 @@
-type grapheme_cluster() :: char() | [char()].
-type direction() :: 'leading' | 'trailing'.
--dialyzer({no_improper_lists, [stack/2, length_b/3]}).
+-dialyzer({no_improper_lists, [stack/2, length_b/3, str_to_map/2]}).
%%% BIFs internal (not documented) should not to be used outside of this module
%%% May be removed
-export([list_to_float/1, list_to_integer/1]).
@@ -563,6 +564,52 @@ find(String, SearchPattern, leading) ->
find(String, SearchPattern, trailing) ->
find_r(String, unicode:characters_to_list(SearchPattern), nomatch).
+-spec jaro_similarity(String1, String2) -> Similarity when
+ String1 :: unicode:chardata(),
+ String2 :: unicode:chardata(),
+ Similarity :: float(). %% Between +0.0 and 1.0
+jaro_similarity(A0, B0) ->
+ {A, ALen} = str_to_gcl_and_length(A0),
+ {B, BLen} = str_to_indexmap(B0),
+ Dist = max(ALen, BLen) div 2,
+ {AM, BM} = jaro_match(A, B, -Dist, Dist, [], []),
+ if
+ ALen =:= 0 andalso BLen =:= 0 ->
+ 1.0;
+ ALen =:= 0 orelse BLen =:= 0 ->
+ 0.0;
+ AM =:= [] ->
+ 0.0;
+ true ->
+ {M,T} = jaro_calc_mt(AM, BM, 0, 0),
+ (M/ALen + M/BLen + (M-T/2)/M) / 3
+ end.
+
+jaro_match([A|As], B0, Min, Max, AM, BM) ->
+ case jaro_detect(maps:get(A, B0, []), Min, Max) of
+ false ->
+ jaro_match(As, B0, Min+1, Max+1, AM, BM);
+ {J, Remain} ->
+ B = B0#{A => Remain},
+ jaro_match(As, B, Min+1, Max+1, [A|AM], add_rsorted({J,A},BM))
+ end;
+jaro_match(_A, _B, _Min, _Max, AM, BM) ->
+ {AM, BM}.
+
+jaro_detect([Idx|Rest], Min, Max) when Min < Idx, Idx < Max ->
+ {Idx, Rest};
+jaro_detect([Idx|Rest], Min, Max) when Idx < Max ->
+ jaro_detect(Rest, Min, Max);
+jaro_detect(_, _, _) ->
+ false.
+
+jaro_calc_mt([CharA|AM], [{_, CharA}|BM], M, T) ->
+ jaro_calc_mt(AM, BM, M+1, T);
+jaro_calc_mt([_|AM], [_|BM], M, T) ->
+ jaro_calc_mt(AM, BM, M+1, T+1);
+jaro_calc_mt([], [], M, T) ->
+ {M, T}.
+
%% Fetch first grapheme cluster and return rest in tail
-spec next_grapheme(String::unicode:chardata()) ->
maybe_improper_list(grapheme_cluster(),unicode:chardata()) |
@@ -1795,6 +1842,37 @@ bin_search_str_2(Bin0, Start, Cont, First, SearchCPs) ->
end.
+%% Returns GC list and length
+str_to_gcl_and_length(S0) ->
+ gcl_and_length(unicode_util:gc(S0), [], 0).
+
+gcl_and_length([C|Str], Acc, N) ->
+ gcl_and_length(unicode_util:gc(Str), [C|Acc], N+1);
+gcl_and_length([], Acc, N) ->
+ {lists:reverse(Acc), N};
+gcl_and_length({error, Err}, _, _) ->
+ error({badarg, Err}).
+
+%% Returns GC map with index and length
+str_to_indexmap(S) ->
+ [M|L] = str_to_map(unicode_util:gc(S), 0),
+ {M,L}.
+
+str_to_map([], L) -> [#{}|L];
+str_to_map([G | Gs], I) ->
+ [M|L] = str_to_map(unicode_util:gc(Gs), I+1),
+ [maps:put(G, [I | maps:get(G, M, [])], M)| L];
+str_to_map({error,Error}, _) ->
+ error({badarg, Error}).
+
+%% Add in decreasing order
+add_rsorted(A, [H|_]=BM) when A > H ->
+ [A|BM];
+add_rsorted(A, [H|BM]) ->
+ [H|add_rsorted(A,BM)];
+add_rsorted(A, []) ->
+ [A].
+
%%---------------------------------------------------------------------------
%% OLD lists API kept for backwards compability
%%---------------------------------------------------------------------------
diff --git a/lib/stdlib/test/string_SUITE.erl b/lib/stdlib/test/string_SUITE.erl
index 5a75c930dd..4e33a4d47b 100644
--- a/lib/stdlib/test/string_SUITE.erl
+++ b/lib/stdlib/test/string_SUITE.erl
@@ -36,7 +36,9 @@
uppercase/1, lowercase/1, titlecase/1, casefold/1,
to_integer/1,to_float/1,
prefix/1, split/1, replace/1, find/1,
- lexemes/1, nth_lexeme/1, cd_gc/1, meas/1
+ lexemes/1, nth_lexeme/1, cd_gc/1,
+ jaro_similarity/1,
+ meas/1
]).
-export([len/1,old_equal/1,old_concat/1,chr_rchr/1,str_rstr/1]).
@@ -66,6 +68,7 @@ groups() ->
to_integer, to_float,
uppercase, lowercase, titlecase, casefold,
prefix, find, split, replace, cd_gc,
+ jaro_similarity,
meas]},
{list_string,
[len, old_equal, old_concat, chr_rchr, str_rstr, span_cspan,
@@ -788,6 +791,36 @@ nth_lexeme(_) ->
?TEST([<<"aae">>,778,"öeeåäö"], [2,"e"], "åäö"),
ok.
+jaro_similarity(_Config) ->
+ ?TEST("", [""], 1.0),
+ ?TEST("", [["", <<"">>]], 1.0),
+ %% From https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance#Jaro_similarity
+ ?TEST("faremviel", ["farmville"], 0.8842592592592592),
+ ?TEST("michelle", ["michael"], 0.8690476190476191),
+ ?TEST("michelle", [<<"michael">>], 0.8690476190476191),
+ ?TEST(<<"Édouard"/utf8>>, ["Claude"], 0.5317460317460317),
+
+
+ ?TEST("farmville", ["farmville"], 1.0),
+ ?TEST("farmville", ["zxzxzx"], +0.0),
+
+ ?TEST("Saturday", ["Sunday"], 0.71944444),
+ ?TEST("Sunday", ["Saturday"], 0.71944444),
+
+ %% Short strings (no translations counted)
+ ?TEST("ca", ["abc"], 0.0),
+ ?TEST("ca", ["cb"], ((1/2+1/2+1)/3)),
+ ?TEST("ca", ["cab"], ((2/2+2/3+1)/3)),
+ ?TEST("caa", ["cab"], ((2/3+2/3+1)/3)),
+ %% With one translation
+ ?TEST("caabx", ["caba"], ((4/5+4/4+((4-2/2)/4))/3)),
+
+ InvalidUTF8 = <<192,192>>,
+ {'EXIT', {badarg, _}} = ?TRY(string:jaro_similarity("foo", InvalidUTF8)),
+ {'EXIT', {badarg, _}} = ?TRY(string:jaro_similarity("foo", <<$a, InvalidUTF8/binary, $z>>)),
+
+ ok.
+
meas(Config) ->
Parent = self(),
@@ -956,7 +989,7 @@ test_1(Line, Func, Str, Args, Exp) ->
check_types(Line, Func, Args, Res),
case res(Res, Exp) of
true -> ok;
- {Res1,Exp1} when is_tuple(Exp1) ->
+ {Res1,Exp1} when is_tuple(Exp1); is_float(Exp1) ->
io:format("~p~n",[Args]),
io:format("~p:~p: ~ts~w =>~n :~w:~w~n",
[Func,Line, Str,Str,Res1,Exp1]),
@@ -999,6 +1032,8 @@ res({S1,S2}=S, {Exp1,Exp2}=E) -> %% For take
{true, true} -> true;
_ -> {S, E}
end;
+res(Float, Exp) when is_float(Exp) ->
+ abs(Float - Exp) < 0.0000001 orelse {Float, Exp};
res(Int, Exp) ->
Int == Exp orelse {Int, Exp}.
@@ -1007,8 +1042,10 @@ check_types(_Line, _Func, _Str, Res)
when is_integer(Res); is_boolean(Res); Res =:= nomatch ->
%% length or equal
ok;
-check_types(Line, Func, [S1,S2], Res)
- when Func =:= concat ->
+check_types(_Line, jaro_similarity, _Str, Res)
+ when is_float(Res) ->
+ ok;
+check_types(Line, concat = Func, [S1,S2], Res) ->
case check_types_1(type(S1),type(S2)) of
ok ->
case check_types_1(type(S1),type(Res)) of
--
2.35.3