File 2291-Regular-expression-replacement-with-a-function.patch of Package erlang
From 94191a89750f06474c69305c56b11b36ed369999 Mon Sep 17 00:00:00 2001
From: Jan Uhlig <juhlig@hnc-agency.org>
Date: Wed, 24 Aug 2022 12:25:20 +0200
Subject: [PATCH] Regular expression replacement with a function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
With this change, `re:replace/3,4` also accepts a function for the
Replacement argument, for cases when more complex processing is
required to generate a replacement. The given function will be
called with the complete match and a list of subexpression matches
as arguments, and the returned value will be inserted into the result.
Co-authored-by: Maria Scott <maria-12648430@hnc-agency.org>
Co-authored-by: Björn Gustavsson <bgustavsson@gmail.com>
---
lib/stdlib/doc/src/re.xml | 40 ++++++++++++++++++++++++++++++++----
lib/stdlib/src/re.erl | 34 ++++++++++++++++++++++--------
lib/stdlib/test/re_SUITE.erl | 14 +++++++++++--
3 files changed, 74 insertions(+), 14 deletions(-)
diff --git a/lib/stdlib/doc/src/re.xml b/lib/stdlib/doc/src/re.xml
index e16ef12f16..43bdb142f4 100644
--- a/lib/stdlib/doc/src/re.xml
+++ b/lib/stdlib/doc/src/re.xml
@@ -75,6 +75,9 @@
<datatype>
<name name="compile_option"/>
</datatype>
+ <datatype>
+ <name name="replace_fun"/>
+ </datatype>
</datatypes>
<funcs>
@@ -363,7 +366,7 @@
elements with Replacement.</fsummary>
<desc>
<p>Replaces the matched part of the <c><anno>Subject</anno></c> string
- with the contents of <c><anno>Replacement</anno></c>.</p>
+ with <c><anno>Replacement</anno></c>.</p>
<p>The permissible options are the same as for
<seemfa marker="#run/3"><c>run/3</c></seemfa>, except that option<c>
capture</c> is not allowed. Instead a <c>{return,
@@ -378,8 +381,8 @@
<c>unicode</c> compilation option is specified to this function, both
the regular expression and <c><anno>Subject</anno></c> are to
specified as valid Unicode <c>charlist()</c>s.</p>
- <p>The replacement string can contain the special character
- <c>&</c>, which inserts the whole matching expression in the
+ <p>If the replacement is given as a string, it can contain the special
+ character <c>&</c>, which inserts the whole matching expression in the
result, and the special sequence <c>\</c>N (where N is an integer >
0), <c>\g</c>N, or <c>\g{</c>N<c>}</c>, resulting in the subexpression
number N, is inserted in the result. If no subexpression with that
@@ -401,6 +404,35 @@ re:replace("abcd","c","[\\&]",[{return,list}]).</code>
<p>gives</p>
<code>
"ab[&]d"</code>
+ <p>If the replacement is given as a fun, it will be called with the
+ whole matching expression as the first argument and a list of subexpression
+ matches in the order in which they appear in the regular expression.
+ The returned value will be inserted in the result.</p>
+ <p><em>Example:</em></p>
+ <code>
+re:replace("abcd", ".(.)", fun(Whole, [<<C>>]) -> <<$#, Whole/binary, $-, (C - $a + $A), $#>> end, [{return, list}]).</code>
+ <p>gives</p>
+ <code>
+"#ab-B#cd"</code>
+ <note>
+ <p>Non-matching optional subexpressions will not be included in the list
+ of subexpression matches if they are the last subexpressions in the
+ regular expression.</p>
+ <p><em>Example:</em></p>
+ <p>The regular expression <c>"(a)(b)?(c)?"</c> ("a", optionally followed
+ by "b", optionally followed by "c") will create the following subexpression
+ lists:</p>
+ <list>
+ <item><c>[<<"a">>, <<"b">>, <<"c">>]</c>
+ when applied to the string <c>"abc"</c></item>
+ <item><c>[<<"a">>, <<>>, <<"c">>]</c>
+ when applied to the string <c>"acx"</c></item>
+ <item><c>[<<"a">>, <<"b">>]</c>
+ when applied to the string <c>"abx"</c></item>
+ <item><c>[<<"a">>]</c>
+ when applied to the string <c>"axx"</c></item>
+ </list>
+ </note>
<p>As with <c>run/3</c>, compilation errors raise the <c>badarg</c>
exception. <seemfa marker="#compile/2"><c>compile/2</c></seemfa>
can be used to get more information about the error.</p>
@@ -972,7 +1004,7 @@ re:run("ABCabcdABC",".*(?<FOO>abcd).*",[{capture,['FOO']}]).</code>
<p>Here the empty binary (<c><<>></c>) represents the
unassigned subpattern. In the <c>binary</c> case, some information
about the matching is therefore lost, as
- <c><<>></c> can
+ <c><<>></c> can
also be an empty string captured.</p>
<p>If differentiation between empty matches and non-existing
subpatterns is necessary, use the <c>type</c> <c>index</c> and do
diff --git a/lib/stdlib/src/re.erl b/lib/stdlib/src/re.erl
index 863bbeb652..3a3eca8f44 100644
--- a/lib/stdlib/src/re.erl
+++ b/lib/stdlib/src/re.erl
@@ -31,6 +31,8 @@
| bsr_anycrlf | bsr_unicode
| no_start_optimize | ucp | never_utf.
+-type replace_fun() :: fun((binary(), [binary()]) -> iodata() | unicode:charlist()).
+
%%% BIFs
-export([internal_run/4]).
@@ -353,7 +355,7 @@ compile_split(_,_) ->
-spec replace(Subject, RE, Replacement) -> iodata() | unicode:charlist() when
Subject :: iodata() | unicode:charlist(),
RE :: mp() | iodata(),
- Replacement :: iodata() | unicode:charlist().
+ Replacement :: iodata() | unicode:charlist() | replace_fun().
replace(Subject,RE,Replacement) ->
try
@@ -366,7 +368,7 @@ replace(Subject,RE,Replacement) ->
-spec replace(Subject, RE, Replacement, Options) -> iodata() | unicode:charlist() when
Subject :: iodata() | unicode:charlist(),
RE :: mp() | iodata() | unicode:charlist(),
- Replacement :: iodata() | unicode:charlist(),
+ Replacement :: iodata() | unicode:charlist() | replace_fun(),
Options :: [Option],
Option :: anchored | global | notbol | noteol | notempty
| notempty_atstart
@@ -380,11 +382,11 @@ replace(Subject,RE,Replacement) ->
replace(Subject,RE,Replacement,Options) ->
try
- {NewOpt,Convert} = process_repl_params(Options,iodata),
- Unicode = check_for_unicode(RE, Options),
- FlatSubject = to_binary(Subject, Unicode),
- FlatReplacement = to_binary(Replacement, Unicode),
- IoList = do_replace(FlatSubject,Subject,RE,FlatReplacement,NewOpt),
+ {NewOpt,Convert} = process_repl_params(Options,iodata),
+ Unicode = check_for_unicode(RE, Options),
+ FlatSubject = to_binary(Subject, Unicode),
+ Replacement1 = normalize_replacement(Replacement, Unicode),
+ IoList = do_replace(FlatSubject,Subject,RE,Replacement1,NewOpt),
case Convert of
iodata ->
IoList;
@@ -412,6 +414,10 @@ replace(Subject,RE,Replacement,Options) ->
badarg_with_info([Subject,RE,Replacement,Options])
end.
+normalize_replacement(Replacement, _Unicode) when is_function(Replacement, 2) ->
+ Replacement;
+normalize_replacement(Replacement, Unicode) ->
+ to_binary(Replacement, Unicode).
do_replace(FlatSubject,Subject,RE,Replacement,Options) ->
case re:run(FlatSubject,RE,Options) of
@@ -512,7 +518,9 @@ precomp_repl(<<X,Rest/binary>>) ->
[<<X,BHead/binary>> | T0];
Other ->
[<<X>> | Other]
- end.
+ end;
+precomp_repl(Repl) when is_function(Repl) ->
+ Repl.
@@ -540,6 +548,16 @@ do_mlist(Whole,Subject,Pos,Repl,[[{MPos,Count} | Sub] | Tail])
[NewData | do_mlist(Whole,Rest,Pos+EatLength,Repl,Tail)].
+do_replace(Subject, Repl, SubExprs0) when is_function(Repl) ->
+ All = binary:part(Subject, hd(SubExprs0)),
+ SubExprs1 =
+ [ if
+ Pos >= 0, Len > 0 ->
+ binary:part(Subject, Pos, Len);
+ true ->
+ <<>>
+ end || {Pos, Len} <- tl(SubExprs0) ],
+ Repl(All, SubExprs1);
do_replace(_,[Bin],_) when is_binary(Bin) ->
Bin;
do_replace(Subject,Repl,SubExprs0) ->
diff --git a/lib/stdlib/test/re_SUITE.erl b/lib/stdlib/test/re_SUITE.erl
index 09a65d8fdd..fc6e977942 100644
--- a/lib/stdlib/test/re_SUITE.erl
+++ b/lib/stdlib/test/re_SUITE.erl
@@ -22,7 +22,7 @@
-export([all/0, suite/0,groups/0,init_per_suite/1, end_per_suite/1,
init_per_group/2,end_per_group/2, pcre/1,compile_options/1,
run_options/1,combined_options/1,replace_autogen/1,
- global_capture/1,replace_input_types/1,replace_return/1,
+ global_capture/1,replace_input_types/1,replace_with_fun/1,replace_return/1,
split_autogen/1,split_options/1,split_specials/1,
error_handling/1,pcre_cve_2008_2371/1,re_version/1,
pcre_compile_workspace_overflow/1,re_infinite_loop/1,
@@ -42,7 +42,7 @@ suite() ->
all() ->
[pcre, compile_options, run_options, combined_options,
replace_autogen, global_capture, replace_input_types,
- replace_return, split_autogen, split_options,
+ replace_with_fun, replace_return, split_autogen, split_options,
split_specials, error_handling, pcre_cve_2008_2371,
pcre_compile_workspace_overflow, re_infinite_loop,
re_backwards_accented, opt_dupnames, opt_all_names,
@@ -365,6 +365,16 @@ replace_input_types(Config) when is_list(Config) ->
<<"a",208,128,"cd">> = re:replace(<<"abcd">>,"b","\x{400}",[{return,binary},unicode]),
ok.
+%% Test replace with a replacement function.
+replace_with_fun(Config) when is_list(Config) ->
+ <<"ABCD">> = re:replace("abcd", ".", fun(<<C>>, []) -> <<(C - $a + $A)>> end, [global, {return, binary}]),
+ <<"AbCd">> = re:replace("abcd", ".", fun(<<C>>, []) when (C - $a) rem 2 =:= 0 -> <<(C - $a + $A)>>; (C, []) -> C end, [global, {return, binary}]),
+ <<"b-ad-c">> = re:replace("abcd", "(.)(.)", fun(_, [A, B]) -> <<B/binary, $-, A/binary>> end, [global, {return, binary}]),
+ <<"#ab-B#cd">> = re:replace("abcd", ".(.)", fun(Whole, [<<C>>]) -> <<$#, Whole/binary, $-, (C - $a + $A), $#>> end, [{return, binary}]),
+ <<"#ab#cd">> = re:replace("abcd", ".(x)?(.)", fun(Whole, [<<>>, _]) -> <<$#, Whole/binary, $#>> end, [{return, binary}]),
+ <<"#ab#cd">> = re:replace("abcd", ".(.)(x)?", fun(Whole, [_]) -> <<$#, Whole/binary, $#>> end, [{return, binary}]),
+ ok.
+
%% Test return options of replace together with global searching.
replace_return(Config) when is_list(Config) ->
{'EXIT',{badarg,_}} = (catch re:replace("na","(a","")),
--
2.35.3