File 2833-Improve-guards-and-bad-list-input.patch of Package erlang
From f44ea5cf1e902c1d1e317c0d12b21054b5b2cd69 Mon Sep 17 00:00:00 2001
From: Dan Gudmundsson <dgud@erlang.org>
Date: Thu, 16 Jun 2022 16:26:15 +0200
Subject: [PATCH 3/3] Improve guards and bad list input
Do not return bad codepoints such as -1.
Improve the guards and check that the code make errors for bad input
in list strings.
---
lib/stdlib/test/unicode_util_SUITE.erl | 20 ++++++++++++++++-
lib/stdlib/uc_spec/gen_unicode_mod.escript | 26 ++++++++++++----------
2 files changed, 33 insertions(+), 13 deletions(-)
diff --git a/lib/stdlib/test/unicode_util_SUITE.erl b/lib/stdlib/test/unicode_util_SUITE.erl
index ff9a1134d7..7be6baa548 100644
--- a/lib/stdlib/test/unicode_util_SUITE.erl
+++ b/lib/stdlib/test/unicode_util_SUITE.erl
@@ -92,7 +92,7 @@ casefold(_) ->
whitespace(_) ->
WS = unicode_util:whitespace(),
WS = lists:filter(fun unicode_util:is_whitespace/1, WS),
- %% TODO add more tests
+ false = unicode_util:is_whitespace($A),
ok.
cp(_) ->
@@ -103,6 +103,15 @@ cp(_) ->
"hejsan" = fetch(["hej"|<<"san">>], Get),
{error, <<128>>} = Get(<<128>>),
{error, [<<128>>, 0]} = Get([<<128>>, 0]),
+
+ {'EXIT', _} = catch Get([-1]),
+ {'EXIT', _} = catch Get([-1, $a]),
+ {'EXIT', _} = catch Get([foo, $a]),
+ {'EXIT', _} = catch Get([-1, $a]),
+ {'EXIT', _} = catch Get([[], -1]),
+ {'EXIT', _} = catch Get([[-1], $a]),
+ {'EXIT', _} = catch Get([[-1, $a], $a]),
+
ok.
gc(Config) ->
@@ -115,6 +124,15 @@ gc(Config) ->
{error, <<128>>} = Get(<<128>>),
{error, [<<128>>, 0]} = Get([<<128>>, 0]),
+ {'EXIT', _} = catch Get([-1]),
+ {'EXIT', _} = catch Get([-1, $a]),
+ {'EXIT', _} = catch Get([foo, $a]),
+ {'EXIT', _} = catch Get([-1, $a]),
+ {'EXIT', _} = catch Get([[], -1]),
+ {'EXIT', _} = catch Get([[-1], $a]),
+ {'EXIT', _} = catch Get([[-1, $a], $a]),
+ {'EXIT', _} = catch Get([<<$a>>, [-1, $a], $a]), %% Current impl
+
0 = fold(fun verify_gc/3, 0, DataDir ++ "/GraphemeBreakTest.txt"),
ok.
diff --git a/lib/stdlib/uc_spec/gen_unicode_mod.escript b/lib/stdlib/uc_spec/gen_unicode_mod.escript
index a677bae2e9..b459141d68 100644
--- a/lib/stdlib/uc_spec/gen_unicode_mod.escript
+++ b/lib/stdlib/uc_spec/gen_unicode_mod.escript
@@ -242,25 +242,26 @@ gen_header(Fd) ->
io:put_chars(Fd, "-compile({inline, [class/1]}).\n"),
io:put_chars(Fd, "-compile(nowarn_unused_vars).\n"),
io:put_chars(Fd, "-dialyzer({no_improper_lists, [cp/1, gc/1, gc_prepend/2]}).\n"),
- io:put_chars(Fd, "-type gc() :: char()|[char()].\n\n\n"),
+ io:put_chars(Fd, "-type gc() :: char()|[char()].\n\n"),
+ io:put_chars(Fd, "-define(IS_CP(CP), (is_integer(CP) andalso 0 =< CP andalso CP < 16#110000)).\n\n\n"),
ok.
gen_static(Fd) ->
io:put_chars(Fd, "-spec lookup(char()) -> #{'canon':=[{byte(),char()}], 'ccc':=byte(), "
"'compat':=[] | {atom(),[{byte(),char()}]}, 'category':={atom(),atom()}}.\n"),
- io:put_chars(Fd, "lookup(Codepoint) when is_integer(Codepoint) ->\n"
+ io:put_chars(Fd, "lookup(Codepoint) when ?IS_CP(Codepoint) ->\n"
" {CCC,Can,Comp,Cat} = unicode_table(Codepoint),\n"
" #{ccc=>CCC, canon=>Can, compat=>Comp, category=>category(Codepoint,Cat)}.\n\n"),
io:put_chars(Fd, "-spec get_case(char()) -> #{'fold':=gc(), 'lower':=gc(), 'title':=gc(), 'upper':=gc()}.\n"),
- io:put_chars(Fd, "get_case(Codepoint) ->\n"
+ io:put_chars(Fd, "get_case(Codepoint) when ?IS_CP(Codepoint) ->\n"
" case case_table(Codepoint) of\n"
" {U,L} -> #{upper=>U,lower=>L,title=>U,fold=>L};\n"
" {U,L,T,F} -> #{upper=>U,lower=>L,title=>T,fold=>F}\n"
" end.\n\n"),
io:put_chars(Fd, "spec_version() -> {14,0}.\n\n\n"),
- io:put_chars(Fd, "class(Codepoint) -> \n"
+ io:put_chars(Fd, "class(Codepoint) when ?IS_CP(Codepoint) -> \n"
" {CCC,_,_,_} = unicode_table(Codepoint),\n CCC.\n\n"),
io:put_chars(Fd, "-spec uppercase(unicode:chardata()) -> "
@@ -316,11 +317,11 @@ gen_static(Fd) ->
io:put_chars(Fd, "%% Returns true if the character is considered wide in non east asian context.\n"),
io:put_chars(Fd, "-spec is_wide(gc()) -> boolean().\n"),
- io:put_chars(Fd, "is_wide(C) when is_integer(C) ->\n"),
+ io:put_chars(Fd, "is_wide(C) when ?IS_CP(C) ->\n"),
io:put_chars(Fd, " is_wide_cp(C);\n"),
io:put_chars(Fd, "is_wide([_, 16#FE0E|Cs]) -> true; %% Presentation sequence\n"),
io:put_chars(Fd, "is_wide([_, 16#FE0F|Cs]) -> true; %% Presentation sequence\n"),
- io:put_chars(Fd, "is_wide([C|Cs]) ->\n"),
+ io:put_chars(Fd, "is_wide([C|Cs]) when ?IS_CP(C) ->\n"),
io:put_chars(Fd, " is_wide_cp(C) orelse is_wide(Cs);\n"),
io:put_chars(Fd, "is_wide([]) ->\n false.\n\n"),
@@ -547,7 +548,7 @@ gen_ws(Fd, Props) ->
gen_cp(Fd) ->
io:put_chars(Fd, "-spec cp(String::unicode:chardata()) ->"
" maybe_improper_list() | {error, unicode:chardata()}.\n"),
- io:put_chars(Fd, "cp([C|_]=L) when is_integer(C) -> L;\n"),
+ io:put_chars(Fd, "cp([C|_]=L) when ?IS_CP(C) -> L;\n"),
io:put_chars(Fd, "cp([List]) -> cp(List);\n"),
io:put_chars(Fd, "cp([List|R]) -> cpl(List, R);\n"),
io:put_chars(Fd, "cp([]) -> [];\n"),
@@ -555,8 +556,8 @@ gen_cp(Fd) ->
io:put_chars(Fd, "cp(<<>>) -> [];\n"),
io:put_chars(Fd, "cp(<<R/binary>>) -> {error,R}.\n"),
io:put_chars(Fd, "\n"),
- io:put_chars(Fd, "cpl([C], R) when is_integer(C) -> [C|cpl_1_cont(R)];\n"),
- io:put_chars(Fd, "cpl([C|T], R) when is_integer(C) -> [C|cpl_cont(T, R)];\n"),
+ io:put_chars(Fd, "cpl([C], R) when ?IS_CP(C) -> [C|cpl_1_cont(R)];\n"),
+ io:put_chars(Fd, "cpl([C|T], R) when ?IS_CP(C) -> [C|cpl_cont(T, R)];\n"),
io:put_chars(Fd, "cpl([List], R) -> cpl(List, R);\n"),
io:put_chars(Fd, "cpl([List|T], R) -> cpl(List, [T|R]);\n"),
io:put_chars(Fd, "cpl([], R) -> cp(R);\n"),
@@ -627,13 +628,13 @@ gen_gc(Fd, GBP) ->
" maybe_improper_list() | {error, unicode:chardata()}.\n"),
io:put_chars(Fd,
"gc([]=R) -> R;\n"
- "gc([CP]=R) when is_integer(CP) -> R;\n"
+ "gc([CP]=R) when ?IS_CP(CP) -> R;\n"
"gc([$\\r=CP|R0]) ->\n"
" case cp(R0) of % Don't break CRLF\n"
" [$\\n|R1] -> [[$\\r,$\\n]|R1];\n"
" T -> [CP|T]\n"
" end;\n"
- "gc([CP1|T1]=T) when CP1 < 256 ->\n"
+ "gc([CP1|T1]=T) when CP1 < 256 andalso ?IS_CP(CP1) ->\n"
" case T1 of\n"
" [CP2|_] when CP2 < 256 -> T; %% Ascii Fast path\n"
" _ -> %% Keep the tail binary.\n"
@@ -653,7 +654,7 @@ gen_gc(Fd, GBP) ->
" end;\n"
" true -> gc_1([CP1|Rest])\n"
" end;\n"
- "gc([CP|_]=T) when is_integer(CP) -> gc_1(T);\n"
+ "gc([CP|_]=T) when ?IS_CP(CP) -> gc_1(T);\n"
"gc(Str) ->\n"
" case cp(Str) of\n"
" {error,_}=Error -> Error;\n"
@@ -865,6 +866,7 @@ gen_gc(Fd, GBP) ->
" _ -> gc_extend2(R1, R0, Acc)\n"
" end\n end.\n\n"),
io:put_chars(Fd, "%% Handle Hangul LV\n"),
+ io:put_chars(Fd, "gc_h_lv_lvt([CP|_], _R0, _Acc) when not ?IS_CP(CP) -> error(badarg);\n"),
GenHangulLV = fun(Range) -> io:format(Fd, "gc_h_lv_lvt~s gc_h_V(R1,[CP|Acc]);\n",
[gen_clause2(Range)]) end,
[GenHangulLV(CP) || CP <- merge_ranges(maps:get(lv,GBP))],
--
2.35.3