File 7311-Add-string-jaro_similarity-2.patch of Package erlang

From 610c1aeb0dbe40ff4218acde73183b075a353412 Mon Sep 17 00:00:00 2001
From: Dan Gudmundsson <dgud@erlang.org>
Date: Thu, 16 Nov 2023 18:41:46 +0100
Subject: [PATCH] Add string:jaro_similarity/2

Calculate word similarity, can for example be used to provide
potential alternatives in error messages.
---
 lib/stdlib/doc/src/string.xml    | 24 ++++++++++
 lib/stdlib/src/string.erl        | 80 +++++++++++++++++++++++++++++++-
 lib/stdlib/test/string_SUITE.erl | 45 ++++++++++++++++--
 3 files changed, 144 insertions(+), 5 deletions(-)

diff --git a/lib/stdlib/doc/src/string.xml b/lib/stdlib/doc/src/string.xml
index 5176f6de60..3c48153045 100644
--- a/lib/stdlib/doc/src/string.xml
+++ b/lib/stdlib/doc/src/string.xml
@@ -244,6 +244,30 @@ true</pre>
       </desc>
     </func>
 
+    <func>
+      <name name="jaro_similarity" arity="2" since="OTP 27.0"/>
+      <fsummary>Calculate the Jaro similarity of two strings.</fsummary>
+      <desc>
+        <p>Returns a float between <c>+0.0</c> and <c>1.0</c> representing the
+        <url href="https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance">
+        Jaro similarity</url> between the given strings. Strings with many letters
+        in common relative to their lengths will score closer to <c>1.0</c>.
+        </p>
+        <p>The Jaro distance between two strings can be calculated with <c>JaroDistance = 1.0-JaroSimilarity</c>.
+        </p>
+        <p><em>Example:</em></p>
+        <pre>
+1> <input>string:jaro_similarity("ditto", "ditto").</input>
+1.0
+2> <input>string:jaro_similarity("foo", "bar").</input>
++0.0
+3> <input>string:jaro_similarity("michelle", "michael").</input>
+0.8690476190476191
+4> <input>string:jaro_similarity(&lt;&lt;"Édouard"/utf8>>, &lt;&lt;"Claude">>).</input>
+0.5317460317460317</pre>
+      </desc>
+    </func>
+
     <func>
       <name name="length" arity="1" since="OTP 20.0"/>
       <fsummary>Calculate length of the string.</fsummary>
diff --git a/lib/stdlib/src/string.erl b/lib/stdlib/src/string.erl
index 163bd8d081..02593e3cff 100644
--- a/lib/stdlib/src/string.erl
+++ b/lib/stdlib/src/string.erl
@@ -58,6 +58,7 @@
          prefix/2,
          split/2,split/3,replace/3,replace/4,
          find/2,find/3,
+         jaro_similarity/2,
          next_codepoint/1, next_grapheme/1
         ]).
 
@@ -85,7 +86,7 @@
 -type grapheme_cluster() :: char() | [char()].
 -type direction() :: 'leading' | 'trailing'.
 
--dialyzer({no_improper_lists, [stack/2, length_b/3]}).
+-dialyzer({no_improper_lists, [stack/2, length_b/3, str_to_map/2]}).
 %%% BIFs internal (not documented) should not to be used outside of this module
 %%% May be removed
 -export([list_to_float/1, list_to_integer/1]).
@@ -563,6 +564,52 @@ find(String, SearchPattern, leading) ->
 find(String, SearchPattern, trailing) ->
     find_r(String, unicode:characters_to_list(SearchPattern), nomatch).
 
+-spec jaro_similarity(String1, String2) -> Similarity when
+      String1 :: unicode:chardata(),
+      String2 :: unicode:chardata(),
+      Similarity :: float(). %% Between +0.0 and 1.0
+jaro_similarity(A0, B0) ->
+    {A, ALen} = str_to_gcl_and_length(A0),
+    {B, BLen} = str_to_indexmap(B0),
+    Dist = max(ALen, BLen) div 2,
+    {AM, BM} = jaro_match(A, B, -Dist, Dist, [], []),
+    if
+        ALen =:= 0 andalso BLen =:= 0 ->
+            1.0;
+        ALen =:= 0 orelse BLen =:= 0 ->
+            0.0;
+        AM =:= [] ->
+            0.0;
+        true ->
+            {M,T} = jaro_calc_mt(AM, BM, 0, 0),
+            (M/ALen + M/BLen + (M-T/2)/M) / 3
+    end.
+
+jaro_match([A|As], B0, Min, Max, AM, BM) ->
+    case jaro_detect(maps:get(A, B0, []), Min, Max) of
+        false ->
+            jaro_match(As, B0, Min+1, Max+1, AM, BM);
+        {J, Remain} ->
+            B = B0#{A => Remain},
+            jaro_match(As, B, Min+1, Max+1, [A|AM], add_rsorted({J,A},BM))
+    end;
+jaro_match(_A, _B, _Min, _Max, AM, BM) ->
+    {AM, BM}.
+
+jaro_detect([Idx|Rest], Min, Max) when Min < Idx, Idx < Max ->
+    {Idx, Rest};
+jaro_detect([Idx|Rest], Min, Max) when Idx < Max ->
+    jaro_detect(Rest, Min, Max);
+jaro_detect(_, _, _) ->
+    false.
+
+jaro_calc_mt([CharA|AM], [{_, CharA}|BM], M, T) ->
+    jaro_calc_mt(AM, BM, M+1, T);
+jaro_calc_mt([_|AM], [_|BM], M, T) ->
+    jaro_calc_mt(AM, BM, M+1, T+1);
+jaro_calc_mt([], [], M, T) ->
+    {M, T}.
+
 %% Fetch first grapheme cluster and return rest in tail
 -spec next_grapheme(String::unicode:chardata()) ->
                            maybe_improper_list(grapheme_cluster(),unicode:chardata()) |
@@ -1795,6 +1842,37 @@ bin_search_str_2(Bin0, Start, Cont, First, SearchCPs) ->
     end.
 
 
+%% Returns GC list and length
+str_to_gcl_and_length(S0) ->
+    gcl_and_length(unicode_util:gc(S0), [], 0).
+
+gcl_and_length([C|Str], Acc, N) ->
+    gcl_and_length(unicode_util:gc(Str), [C|Acc], N+1);
+gcl_and_length([], Acc, N) ->
+    {lists:reverse(Acc), N};
+gcl_and_length({error, Err}, _, _) ->
+    error({badarg, Err}).
+
+%% Returns GC map with index and length
+str_to_indexmap(S) ->
+    [M|L] = str_to_map(unicode_util:gc(S), 0),
+    {M,L}.
+
+str_to_map([], L) -> [#{}|L];
+str_to_map([G | Gs], I) ->
+    [M|L] = str_to_map(unicode_util:gc(Gs), I+1),
+    [maps:put(G, [I | maps:get(G, M, [])], M)| L];
+str_to_map({error,Error}, _) ->
+    error({badarg, Error}).
+
+%% Add in decreasing order
+add_rsorted(A, [H|_]=BM) when A > H ->
+    [A|BM];
+add_rsorted(A, [H|BM]) ->
+    [H|add_rsorted(A,BM)];
+add_rsorted(A, []) ->
+    [A].
+
 %%---------------------------------------------------------------------------
 %% OLD lists API kept for backwards compability
 %%---------------------------------------------------------------------------
diff --git a/lib/stdlib/test/string_SUITE.erl b/lib/stdlib/test/string_SUITE.erl
index 5a75c930dd..4e33a4d47b 100644
--- a/lib/stdlib/test/string_SUITE.erl
+++ b/lib/stdlib/test/string_SUITE.erl
@@ -36,7 +36,9 @@
          uppercase/1, lowercase/1, titlecase/1, casefold/1,
          to_integer/1,to_float/1,
          prefix/1, split/1, replace/1, find/1,
-         lexemes/1, nth_lexeme/1, cd_gc/1, meas/1
+         lexemes/1, nth_lexeme/1, cd_gc/1,
+         jaro_similarity/1,
+         meas/1
         ]).
 
 -export([len/1,old_equal/1,old_concat/1,chr_rchr/1,str_rstr/1]).
@@ -66,6 +68,7 @@ groups() ->
        to_integer, to_float,
        uppercase, lowercase, titlecase, casefold,
        prefix, find, split, replace, cd_gc,
+       jaro_similarity,
        meas]},
      {list_string,
       [len, old_equal, old_concat, chr_rchr, str_rstr, span_cspan,
@@ -788,6 +791,36 @@ nth_lexeme(_) ->
     ?TEST([<<"aae">>,778,"öeeåäö"], [2,"e"], "åäö"),
     ok.
 
+jaro_similarity(_Config) ->
+    ?TEST("", [""], 1.0),
+    ?TEST("", [["", <<"">>]], 1.0),
+    %% From https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance#Jaro_similarity
+    ?TEST("faremviel", ["farmville"], 0.8842592592592592),
+    ?TEST("michelle", ["michael"], 0.8690476190476191),
+    ?TEST("michelle", [<<"michael">>], 0.8690476190476191),
+    ?TEST(<<"Édouard"/utf8>>, ["Claude"], 0.5317460317460317),
+
+
+    ?TEST("farmville", ["farmville"], 1.0),
+    ?TEST("farmville", ["zxzxzx"], +0.0),
+
+    ?TEST("Saturday", ["Sunday"], 0.71944444),
+    ?TEST("Sunday", ["Saturday"], 0.71944444),
+
+    %% Short strings (no translations counted)
+    ?TEST("ca", ["abc"], 0.0),
+    ?TEST("ca", ["cb"],  ((1/2+1/2+1)/3)),
+    ?TEST("ca", ["cab"], ((2/2+2/3+1)/3)),
+    ?TEST("caa", ["cab"], ((2/3+2/3+1)/3)),
+    %% With one translation
+    ?TEST("caabx", ["caba"], ((4/5+4/4+((4-2/2)/4))/3)),
+
+    InvalidUTF8 = <<192,192>>,
+    {'EXIT', {badarg, _}} = ?TRY(string:jaro_similarity("foo", InvalidUTF8)),
+    {'EXIT', {badarg, _}} = ?TRY(string:jaro_similarity("foo", <<$a, InvalidUTF8/binary, $z>>)),
+
+    ok.
+
 
 meas(Config) ->
     Parent = self(),
@@ -956,7 +989,7 @@ test_1(Line, Func, Str, Args, Exp) ->
         check_types(Line, Func, Args, Res),
         case res(Res, Exp) of
             true -> ok;
-            {Res1,Exp1} when is_tuple(Exp1) ->
+            {Res1,Exp1} when is_tuple(Exp1); is_float(Exp1) ->
                 io:format("~p~n",[Args]),
                 io:format("~p:~p: ~ts~w =>~n  :~w:~w~n",
                           [Func,Line, Str,Str,Res1,Exp1]),
@@ -999,6 +1032,8 @@ res({S1,S2}=S, {Exp1,Exp2}=E) -> %% For take
         {true, true} -> true;
         _ -> {S, E}
     end;
+res(Float, Exp) when is_float(Exp) ->
+    abs(Float - Exp) < 0.0000001 orelse {Float, Exp};
 res(Int, Exp) ->
     Int == Exp orelse {Int, Exp}.
 
@@ -1007,8 +1042,10 @@ check_types(_Line, _Func, _Str, Res)
   when is_integer(Res); is_boolean(Res); Res =:= nomatch ->
     %% length or equal
     ok;
-check_types(Line, Func, [S1,S2], Res)
-  when Func =:= concat ->
+check_types(_Line, jaro_similarity, _Str, Res)
+  when is_float(Res) ->
+    ok;
+check_types(Line, concat = Func, [S1,S2], Res) ->
     case check_types_1(type(S1),type(S2)) of
         ok ->
             case check_types_1(type(S1),type(Res)) of
-- 
2.35.3

openSUSE Build Service is sponsored by