File 2291-Regular-expression-replacement-with-a-function.patch of Package erlang

From 94191a89750f06474c69305c56b11b36ed369999 Mon Sep 17 00:00:00 2001
From: Jan Uhlig <juhlig@hnc-agency.org>
Date: Wed, 24 Aug 2022 12:25:20 +0200
Subject: [PATCH] Regular expression replacement with a function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With this change, `re:replace/3,4` also accepts a function for the
Replacement argument, for cases when more complex processing is
required to generate a replacement. The given function will be
called with the complete match and a list of subexpression matches
as arguments, and the returned value will be inserted into the result.

Co-authored-by: Maria Scott <maria-12648430@hnc-agency.org>
Co-authored-by: Björn Gustavsson <bgustavsson@gmail.com>
---
 lib/stdlib/doc/src/re.xml    | 40 ++++++++++++++++++++++++++++++++----
 lib/stdlib/src/re.erl        | 34 ++++++++++++++++++++++--------
 lib/stdlib/test/re_SUITE.erl | 14 +++++++++++--
 3 files changed, 74 insertions(+), 14 deletions(-)

diff --git a/lib/stdlib/doc/src/re.xml b/lib/stdlib/doc/src/re.xml
index e16ef12f16..43bdb142f4 100644
--- a/lib/stdlib/doc/src/re.xml
+++ b/lib/stdlib/doc/src/re.xml
@@ -75,6 +75,9 @@
     <datatype>
       <name name="compile_option"/>
     </datatype>
+    <datatype>
+      <name name="replace_fun"/>
+    </datatype>
   </datatypes>
 
   <funcs>
@@ -363,7 +366,7 @@
         elements with Replacement.</fsummary>
       <desc>
         <p>Replaces the matched part of the <c><anno>Subject</anno></c> string
-          with the contents of <c><anno>Replacement</anno></c>.</p>
+          with <c><anno>Replacement</anno></c>.</p>
         <p>The permissible options are the same as for
           <seemfa marker="#run/3"><c>run/3</c></seemfa>, except that option<c>
           capture</c> is not allowed. Instead a <c>{return,
@@ -378,8 +381,8 @@
           <c>unicode</c> compilation option is specified to this function, both
           the regular expression and <c><anno>Subject</anno></c> are to
           specified as valid Unicode <c>charlist()</c>s.</p>
-        <p>The replacement string can contain the special character
-          <c>&amp;</c>, which inserts the whole matching expression in the
+        <p>If the replacement is given as a string, it  can contain the special
+          character <c>&amp;</c>, which inserts the whole matching expression in the
           result, and the special sequence <c>\</c>N (where N is an integer &gt;
           0), <c>\g</c>N, or <c>\g{</c>N<c>}</c>, resulting in the subexpression
           number N, is inserted in the result. If no subexpression with that
@@ -401,6 +404,35 @@ re:replace("abcd","c","[\\&amp;]",[{return,list}]).</code>
         <p>gives</p>
         <code>
 "ab[&amp;]d"</code>
+        <p>If the replacement is given as a fun, it will be called with the
+          whole matching expression as the first argument and a list of subexpression
+          matches in the order in which they appear in the regular expression.
+          The returned value will be inserted in the result.</p>
+        <p><em>Example:</em></p>
+        <code>
+re:replace("abcd", ".(.)", fun(Whole, [&lt;&lt;C&gt;&gt;]) -> &lt;&lt;$#, Whole/binary, $-, (C - $a + $A), $#&gt;&gt; end, [{return, list}]).</code>
+        <p>gives</p>
+        <code>
+"#ab-B#cd"</code>
+        <note>
+          <p>Non-matching optional subexpressions will not be included in the list
+            of subexpression matches if they are the last subexpressions in the
+            regular expression.</p>
+          <p><em>Example:</em></p>
+          <p>The regular expression <c>"(a)(b)?(c)?"</c> ("a", optionally followed
+            by "b", optionally followed by "c") will create the following subexpression
+            lists:</p>
+          <list>
+            <item><c>[&lt;&lt;"a"&gt;&gt;, &lt;&lt;"b"&gt;&gt;, &lt;&lt;"c"&gt;&gt;]</c>
+              when applied to the string <c>"abc"</c></item>
+            <item><c>[&lt;&lt;"a"&gt;&gt;, &lt;&lt;&gt;&gt;, &lt;&lt;"c"&gt;&gt;]</c>
+              when applied to the string <c>"acx"</c></item>
+            <item><c>[&lt;&lt;"a"&gt;&gt;, &lt;&lt;"b"&gt;&gt;]</c>
+              when applied to the string <c>"abx"</c></item>
+            <item><c>[&lt;&lt;"a"&gt;&gt;]</c>
+              when applied to the string <c>"axx"</c></item>
+          </list>
+        </note>
         <p>As with <c>run/3</c>, compilation errors raise the <c>badarg</c>
           exception. <seemfa marker="#compile/2"><c>compile/2</c></seemfa>
           can be used to get more information about the error.</p>
@@ -972,7 +1004,7 @@ re:run("ABCabcdABC",".*(?&lt;FOO&gt;abcd).*",[{capture,['FOO']}]).</code>
             <p>Here the empty binary (<c>&lt;&lt;&gt;&gt;</c>) represents the
               unassigned subpattern. In the <c>binary</c> case, some information
               about the matching is therefore lost, as
-	      <c>&lt;&lt;&gt;&gt;</c> can
+              <c>&lt;&lt;&gt;&gt;</c> can
               also be an empty string captured.</p>
             <p>If differentiation between empty matches and non-existing
               subpatterns is necessary, use the <c>type</c> <c>index</c> and do
diff --git a/lib/stdlib/src/re.erl b/lib/stdlib/src/re.erl
index 863bbeb652..3a3eca8f44 100644
--- a/lib/stdlib/src/re.erl
+++ b/lib/stdlib/src/re.erl
@@ -31,6 +31,8 @@
                         | bsr_anycrlf | bsr_unicode
                         | no_start_optimize | ucp | never_utf.
 
+-type replace_fun() :: fun((binary(), [binary()]) -> iodata() | unicode:charlist()).
+
 %%% BIFs
 
 -export([internal_run/4]).
@@ -353,7 +355,7 @@ compile_split(_,_) ->
 -spec replace(Subject, RE, Replacement) -> iodata() | unicode:charlist() when
       Subject :: iodata() | unicode:charlist(),
       RE :: mp() | iodata(),
-      Replacement :: iodata() | unicode:charlist().
+      Replacement :: iodata() | unicode:charlist() | replace_fun().
 
 replace(Subject,RE,Replacement) ->
     try
@@ -366,7 +368,7 @@ replace(Subject,RE,Replacement) ->
 -spec replace(Subject, RE, Replacement, Options) -> iodata() | unicode:charlist() when
       Subject :: iodata() | unicode:charlist(),
       RE :: mp() | iodata() | unicode:charlist(),
-      Replacement :: iodata() | unicode:charlist(),
+      Replacement :: iodata() | unicode:charlist() | replace_fun(),
       Options :: [Option],
       Option :: anchored | global | notbol | noteol | notempty 
 	      | notempty_atstart
@@ -380,11 +382,11 @@ replace(Subject,RE,Replacement) ->
 
 replace(Subject,RE,Replacement,Options) ->
     try
-    {NewOpt,Convert} = process_repl_params(Options,iodata),
-    Unicode = check_for_unicode(RE, Options),
-    FlatSubject = to_binary(Subject, Unicode),
-    FlatReplacement = to_binary(Replacement, Unicode),
-    IoList = do_replace(FlatSubject,Subject,RE,FlatReplacement,NewOpt),
+	{NewOpt,Convert} = process_repl_params(Options,iodata),
+	Unicode = check_for_unicode(RE, Options),
+	FlatSubject = to_binary(Subject, Unicode),
+	Replacement1 = normalize_replacement(Replacement, Unicode),
+	IoList = do_replace(FlatSubject,Subject,RE,Replacement1,NewOpt),
 	case Convert of
 	    iodata ->
 		IoList;
@@ -412,6 +414,10 @@ replace(Subject,RE,Replacement,Options) ->
 	    badarg_with_info([Subject,RE,Replacement,Options])
     end.
 
+normalize_replacement(Replacement, _Unicode) when is_function(Replacement, 2) ->
+    Replacement;
+normalize_replacement(Replacement, Unicode) ->
+    to_binary(Replacement, Unicode).
 
 do_replace(FlatSubject,Subject,RE,Replacement,Options) ->
     case re:run(FlatSubject,RE,Options) of
@@ -512,7 +518,9 @@ precomp_repl(<<X,Rest/binary>>) ->
 	    [<<X,BHead/binary>> | T0];
 	Other ->
 	    [<<X>> | Other]
-    end.
+    end;
+precomp_repl(Repl) when is_function(Repl) ->
+    Repl.
     
 
 
@@ -540,6 +548,16 @@ do_mlist(Whole,Subject,Pos,Repl,[[{MPos,Count} | Sub] | Tail])
     [NewData | do_mlist(Whole,Rest,Pos+EatLength,Repl,Tail)].
 
 
+do_replace(Subject, Repl, SubExprs0) when is_function(Repl) ->
+    All = binary:part(Subject, hd(SubExprs0)),
+    SubExprs1 =
+        [ if
+              Pos >= 0, Len > 0 ->
+                  binary:part(Subject, Pos, Len);
+              true ->
+                  <<>>
+          end || {Pos, Len} <- tl(SubExprs0) ],
+    Repl(All, SubExprs1);
 do_replace(_,[Bin],_) when is_binary(Bin) ->
     Bin;
 do_replace(Subject,Repl,SubExprs0) ->
diff --git a/lib/stdlib/test/re_SUITE.erl b/lib/stdlib/test/re_SUITE.erl
index 09a65d8fdd..fc6e977942 100644
--- a/lib/stdlib/test/re_SUITE.erl
+++ b/lib/stdlib/test/re_SUITE.erl
@@ -22,7 +22,7 @@
 -export([all/0, suite/0,groups/0,init_per_suite/1, end_per_suite/1, 
 	 init_per_group/2,end_per_group/2, pcre/1,compile_options/1,
 	 run_options/1,combined_options/1,replace_autogen/1,
-	 global_capture/1,replace_input_types/1,replace_return/1,
+	 global_capture/1,replace_input_types/1,replace_with_fun/1,replace_return/1,
 	 split_autogen/1,split_options/1,split_specials/1,
 	 error_handling/1,pcre_cve_2008_2371/1,re_version/1,
 	 pcre_compile_workspace_overflow/1,re_infinite_loop/1, 
@@ -42,7 +42,7 @@ suite() ->
 all() -> 
     [pcre, compile_options, run_options, combined_options,
      replace_autogen, global_capture, replace_input_types,
-     replace_return, split_autogen, split_options,
+     replace_with_fun, replace_return, split_autogen, split_options,
      split_specials, error_handling, pcre_cve_2008_2371,
      pcre_compile_workspace_overflow, re_infinite_loop, 
      re_backwards_accented, opt_dupnames, opt_all_names, 
@@ -365,6 +365,16 @@ replace_input_types(Config) when is_list(Config) ->
     <<"a",208,128,"cd">> = re:replace(<<"abcd">>,"b","\x{400}",[{return,binary},unicode]),
     ok.
 
+%% Test replace with a replacement function.
+replace_with_fun(Config) when is_list(Config) ->
+    <<"ABCD">> = re:replace("abcd", ".", fun(<<C>>, []) -> <<(C - $a + $A)>> end, [global, {return, binary}]),
+    <<"AbCd">> = re:replace("abcd", ".", fun(<<C>>, []) when (C - $a) rem 2 =:= 0 -> <<(C - $a + $A)>>; (C, []) -> C end, [global, {return, binary}]),
+    <<"b-ad-c">> = re:replace("abcd", "(.)(.)", fun(_, [A, B]) -> <<B/binary, $-, A/binary>> end, [global, {return, binary}]),
+    <<"#ab-B#cd">> = re:replace("abcd", ".(.)", fun(Whole, [<<C>>]) -> <<$#, Whole/binary, $-, (C - $a + $A), $#>> end, [{return, binary}]),
+    <<"#ab#cd">> = re:replace("abcd", ".(x)?(.)", fun(Whole, [<<>>, _]) -> <<$#, Whole/binary, $#>> end, [{return, binary}]),
+    <<"#ab#cd">> = re:replace("abcd", ".(.)(x)?", fun(Whole, [_]) -> <<$#, Whole/binary, $#>> end, [{return, binary}]),
+    ok.
+
 %% Test return options of replace together with global searching.
 replace_return(Config) when is_list(Config) ->
     {'EXIT',{badarg,_}} = (catch re:replace("na","(a","")),
-- 
2.35.3

openSUSE Build Service is sponsored by