File 2051-stdlib-Add-classifying-functions.patch of Package erlang
From ca7e19b8723b36bc96c3127eae4e54d25685197f Mon Sep 17 00:00:00 2001
From: Dan Gudmundsson <dgud@erlang.org>
Date: Tue, 18 Nov 2025 12:27:05 +0100
Subject: [PATCH] stdlib: Add classifying functions
When making scanners there might be a need to classify codepoints if
the specification follows the Unicode standard.
The following:
is_whitespace/1,
is_ID_start/1,
is_ID_continue/1,
category/1
is (at least) needed to scan and parse for example markdown.
This commit is backward incompatible if the *undocumented*
unicode_util:[is_]whitespace(..) functions have been used.
unicode_util:whitespace() have been changed to
unicode_util:pattern_whitespace() string module uses it and documents
that is uses the `pattern_whitespace` property.
So the name of the function have been changed, this is because
`pattern_whitespace` is not a subset of `whitespace` characters,
as was discovered during testing.
is_whitespace(Char) have been changed to use the Unicode `whitespace`
property.
See Unicode Standard Annex #31 and #44.
---
lib/stdlib/src/erl_stdlib_errors.erl | 12 +-
lib/stdlib/src/string.erl | 4 +-
lib/stdlib/src/unicode.erl | 158 ++++++++++++++++++++-
lib/stdlib/test/Makefile | 3 +
lib/stdlib/test/unicode_SUITE.erl | 154 +++++++++++++++++++-
lib/stdlib/test/unicode_util_SUITE.erl | 6 +-
lib/stdlib/uc_spec/gen_unicode_mod.escript | 17 ++-
7 files changed, 333 insertions(+), 21 deletions(-)
diff --git a/lib/stdlib/src/erl_stdlib_errors.erl b/lib/stdlib/src/erl_stdlib_errors.erl
index 49db693d57..31503da214 100644
--- a/lib/stdlib/src/erl_stdlib_errors.erl
+++ b/lib/stdlib/src/erl_stdlib_errors.erl
@@ -428,7 +428,15 @@ format_unicode_error(characters_to_nfkc_list, [_]) ->
format_unicode_error(characters_to_nfkd_binary, [_]) ->
[bad_char_data];
format_unicode_error(characters_to_nfkd_list, [_]) ->
- [bad_char_data].
+ [bad_char_data];
+format_unicode_error(category, [_]) ->
+ [bad_char];
+format_unicode_error(is_whitespace, [_]) ->
+ [bad_char];
+format_unicode_error(is_id_start, [_]) ->
+ [bad_char];
+format_unicode_error(is_id_continue, [_]) ->
+ [bad_char].
unicode_char_data(Chars) ->
try unicode:characters_to_binary(Chars) of
@@ -1121,6 +1129,8 @@ expand_error(bad_boolean) ->
<<"not a boolean value">>;
expand_error(bad_binary_list) ->
<<"not a flat list of binaries">>;
+expand_error(bad_char) ->
+ <<"not a valid character">>;
expand_error(bad_char_data) ->
<<"not valid character data (an iodata term)">>;
expand_error(bad_binary_pattern) ->
diff --git a/lib/stdlib/src/string.erl b/lib/stdlib/src/string.erl
index bfcec6e6ca..ad61855ba0 100644
--- a/lib/stdlib/src/string.erl
+++ b/lib/stdlib/src/string.erl
@@ -477,7 +477,7 @@ pad(CD, Length, both, Char) when is_integer(Length) ->
-spec trim(String) -> unicode:chardata() when
String :: unicode:chardata().
trim(Str) ->
- trim(Str, both, unicode_util:whitespace()).
+ trim(Str, both, unicode_util:pattern_whitespace()).
-doc """
Equivalent to [`trim(String, Dir, Whitespace})`](`trim/3`) where
@@ -490,7 +490,7 @@ as Pattern_White_Space in
String :: unicode:chardata(),
Dir :: direction() | 'both'.
trim(Str, Dir) ->
- trim(Str, Dir, unicode_util:whitespace()).
+ trim(Str, Dir, unicode_util:pattern_whitespace()).
-doc """
Returns a string, where leading or trailing, or both, `Characters` have been
diff --git a/lib/stdlib/src/unicode.erl b/lib/stdlib/src/unicode.erl
index 376444d930..7186770a60 100644
--- a/lib/stdlib/src/unicode.erl
+++ b/lib/stdlib/src/unicode.erl
@@ -21,7 +21,7 @@
%%
-module(unicode).
-moduledoc """
-Functions for converting Unicode characters.
+Functions for converting and classifying Unicode characters.
This module contains functions for converting between different character
representations. It converts between ISO Latin-1 characters and Unicode
@@ -71,9 +71,12 @@ normalization can be found in the
characters_to_nfkc_list/1, characters_to_nfkc_binary/1
]).
+-export([is_whitespace/1, is_id_start/1, is_id_continue/1, category/1]).
+
-export_type([chardata/0, charlist/0, encoding/0, external_chardata/0,
external_charlist/0, latin1_char/0, latin1_chardata/0,
- latin1_charlist/0, latin1_binary/0, unicode_binary/0]).
+ latin1_charlist/0, latin1_binary/0, unicode_binary/0,
+ category/0]).
-type encoding() :: 'latin1' | 'unicode' | 'utf8'
| 'utf16' | {'utf16', endian()}
@@ -108,6 +111,15 @@ than UTF-8 (that is, UTF-16 or UTF-32).
latin1_binary() |
latin1_charlist(),
latin1_binary() | nil()).
+-doc "Character category".
+-type category() ::
+ {letter, uppercase | lowercase | titlecase | modifier | other} |
+ {mark, non_spacing | spacing_combining | enclosing} |
+ {number, decimal | letter | other} |
+ {separator, space | line | paragraph} |
+ {other, control | format | surrogate | private | not_assigned} |
+ {punctuation, connector | dash | open | close | initial | final | other} |
+ {symbol, math | currency | modifier | other}.
%% We must inline these functions so that the stacktrace points to
%% the correct function.
@@ -122,6 +134,8 @@ than UTF-8 (that is, UTF-16 or UTF-32).
-export([bin_is_7bit/1, characters_to_binary/2, characters_to_list/2]).
+-define(IS_CP(CP), (is_integer(CP) andalso 0 =< CP andalso CP =< 16#10FFFF)).
+
-doc false.
-spec bin_is_7bit(Binary) -> boolean() when
Binary :: binary().
@@ -681,13 +695,149 @@ characters_to_nfkc_binary(CD, N, Row, Acc) when N > 0 ->
characters_to_nfkc_binary(CD, _, Row, Acc) ->
characters_to_nfkc_binary(CD, ?GC_N, [], prepend_row_to_acc(Row, Acc)).
+-doc """
+Returns true if `Char` is a whitespace.
+
+Whitespace is defined in
+[Unicode Standard Annex #44](http://unicode.org/reports/tr44/).
+
+```erlang
+1> unicode:is_whitespace($\s).
+true
+2> unicode:is_whitespace($😊).
+false
+```
+""".
+-doc(#{since => ~"@OTP-19858@"}).
+-spec is_whitespace(char()) -> boolean().
+is_whitespace(X) %% ASCII (and low number) Optimizations
+ when X =:= 9; X =:= 10; X =:= 11; X =:= 12; X =:= 13; X =:= 32;
+ X =:= 133; X =:= 160 ->
+ true;
+is_whitespace(Char) when is_integer(Char), 0 =< Char, Char =< 5000 -> %% Arbitrary limit without whitespace
+ false;
+is_whitespace(Char) when ?IS_CP(Char) ->
+ unicode_util:is_whitespace(Char);
+is_whitespace(Term) ->
+ badarg_with_info([Term]).
+
+
+-doc """
+Returns true if `Char` is an identifier start.
+
+Identifier start is defined by the ID_Start property in
+[Unicode Standard Annex #31](https://unicode.org/reports/tr31/#D1).
+
+```erlang
+1> unicode:is_id_start($a).
+true
+2> unicode:is_id_start($_).
+false
+3> unicode:is_id_start($-).
+false
+```
+""".
+-doc(#{since => ~"@OTP-19858@"}).
+-spec is_id_start(char()) -> boolean().
+is_id_start(X) %% ASCII optimizations
+ when X =:= 65; X =:= 66; X =:= 67; X =:= 68; X =:= 69; X =:= 70; X =:= 71;
+ X =:= 72; X =:= 73; X =:= 74; X =:= 75; X =:= 76; X =:= 77; X =:= 78;
+ X =:= 79; X =:= 80; X =:= 81; X =:= 82; X =:= 83; X =:= 84; X =:= 85;
+ X =:= 86; X =:= 87; X =:= 88; X =:= 89; X =:= 90; X =:= 97; X =:= 98;
+ X =:= 99; X =:= 100; X =:= 101; X =:= 102; X =:= 103; X =:= 104; X =:= 105;
+ X =:= 106; X =:= 107; X =:= 108; X =:= 109; X =:= 110; X =:= 111; X =:= 112;
+ X =:= 113; X =:= 114; X =:= 115; X =:= 116; X =:= 117; X =:= 118; X =:= 119;
+ X =:= 120; X =:= 121; X =:= 122 ->
+ true;
+is_id_start(Char) when is_integer(Char), 0 =< Char, Char =< 127 ->
+ false;
+is_id_start(Char) when ?IS_CP(Char) ->
+ case unicode_util:category(Char) of
+ {number,letter} -> true;
+ {letter,modifier} -> unicode_util:is_letter_not_pattern_syntax(Char);
+ {letter,_} -> true;
+ {_,_} -> unicode_util:is_other_id_start(Char)
+ end;
+is_id_start(Term) ->
+ badarg_with_info([Term]).
+
+
+-doc """
+Returns true if `Char` is an identifier continuation.
+
+Identifier continuation is defined by the ID_Continue property in
+[Unicode Standard Annex #31](https://unicode.org/reports/tr31/#D1).
+
+```erlang
+1> unicode:is_id_continue($a).
+true
+2> unicode:is_id_continue($_).
+true
+3> unicode:is_id_continue($-).
+false
+```
+""".
+-doc(#{since => ~"@OTP-19858@"}).
+-spec is_id_continue(char()) -> boolean().
+is_id_continue(X)
+ when X =:= 48; X =:= 49; X =:= 50; X =:= 51; X =:= 52; X =:= 53; X =:= 54;
+ X =:= 55; X =:= 56; X =:= 57; X =:= 65; X =:= 66; X =:= 67; X =:= 68;
+ X =:= 69; X =:= 70; X =:= 71; X =:= 72; X =:= 73; X =:= 74; X =:= 75;
+ X =:= 76; X =:= 77; X =:= 78; X =:= 79; X =:= 80; X =:= 81; X =:= 82;
+ X =:= 83; X =:= 84; X =:= 85; X =:= 86; X =:= 87; X =:= 88; X =:= 89;
+ X =:= 90; X =:= 95; X =:= 97; X =:= 98; X =:= 99; X =:= 100; X =:= 101;
+ X =:= 102; X =:= 103; X =:= 104; X =:= 105; X =:= 106; X =:= 107;
+ X =:= 108; X =:= 109; X =:= 110; X =:= 111; X =:= 112; X =:= 113;
+ X =:= 114; X =:= 115; X =:= 116; X =:= 117; X =:= 118; X =:= 119;
+ X =:= 120; X =:= 121; X =:= 122 ->
+ true;
+is_id_continue(Char) when is_integer(Char), 0 =< Char, Char =< 127 ->
+ false;
+is_id_continue(Char) when ?IS_CP(Char) ->
+ case unicode_util:category(Char) of
+ {punctuation, connector} -> true;
+ {mark,non_spacing} -> true;
+ {mark,spacing_combining} -> true;
+ {number,other} -> unicode_util:is_other_id_continue(Char);
+ {number,_} -> true;
+ {letter,modifier} -> unicode_util:is_letter_not_pattern_syntax(Char);
+ {letter,_} -> true;
+ {_,_} -> unicode_util:is_other_id_start(Char) orelse
+ unicode_util:is_other_id_continue(Char)
+ end;
+is_id_continue(Term) ->
+ badarg_with_info([Term]).
+
+-doc """
+Returns the `Char` category.
+
+```erlang
+1> unicode:category($a).
+{letter,lowercase}
+2> unicode:category($Ä).
+{letter,uppercase}
+3> unicode:category($😊).
+{symbol,other}
+4> unicode:category($€).
+{symbol,currency}
+5> unicode:category($[).
+{punctuation,open}
+```
+""".
+-doc(#{since => ~"@OTP-19858@"}).
+-spec category(char()) -> category().
+category(Char) when ?IS_CP(Char) ->
+ unicode_util:category(Char);
+category(Term) ->
+ badarg_with_info([Term]).
+
+%% internals
+
acc_to_binary(Acc) ->
list_to_binary(lists:reverse(Acc)).
prepend_row_to_acc(Row, Acc) ->
[characters_to_binary(lists:reverse(Row))|Acc].
-%% internals
-
-doc false.
characters_to_list_int(ML, Encoding) ->
try
diff --git a/lib/stdlib/test/Makefile b/lib/stdlib/test/Makefile
index 1ce324b46c..7df2b43eac 100644
--- a/lib/stdlib/test/Makefile
+++ b/lib/stdlib/test/Makefile
@@ -197,6 +197,9 @@ release_tests_spec: make_emakefile
$(ERL_FILES) $(COVERFILE) $(EXTRA_FILES) "$(RELSYSDIR)"
chmod -R u+w "$(RELSYSDIR)"
@tar cf - *_SUITE_data property_test | (cd "$(RELSYSDIR)"; tar xf -)
+ $(INSTALL_DIR) "$(RELSYSDIR)/unicode_SUITE_data"
+ $(INSTALL_DATA) ../uc_spec/PropList.txt ../uc_spec/DerivedCoreProperties.txt \
+ "$(RELSYSDIR)/unicode_SUITE_data"
$(INSTALL_DIR) "$(RELSYSDIR)/stdlib_SUITE_data"
$(INSTALL_DATA) $(ERL_TOP)/make/otp_version_tickets "$(RELSYSDIR)/stdlib_SUITE_data"
diff --git a/lib/stdlib/test/unicode_SUITE.erl b/lib/stdlib/test/unicode_SUITE.erl
index 62306214a0..886b073854 100644
--- a/lib/stdlib/test/unicode_SUITE.erl
+++ b/lib/stdlib/test/unicode_SUITE.erl
@@ -39,14 +39,17 @@
normalize/1,
huge_illegal_code_points/1,
bin_is_7bit/1,
- error_info/1
+ error_info/1,
+ is_whitespace/1,
+ category/1,
+ is_id/1
]).
suite() ->
[{ct_hooks,[ts_install_cth]},
{timetrap,{minutes,20}}].
-all() ->
+all() ->
[utf8_illegal_sequences_bif,
utf16_illegal_sequences_bif, random_lists, roundtrips,
latin1, exceptions,
@@ -55,6 +58,7 @@ all() ->
{group,binaries_errors},
huge_illegal_code_points,
bin_is_7bit,
+ {group, classify},
error_info].
groups() ->
@@ -63,7 +67,12 @@ groups() ->
ex_binaries_errors_utf16_little,
ex_binaries_errors_utf16_big,
ex_binaries_errors_utf32_little,
- ex_binaries_errors_utf32_big]}].
+ ex_binaries_errors_utf32_big]},
+ {classify, [parallel],
+ [is_whitespace,
+ category,
+ is_id]}
+ ].
binaries_errors_limit(Config) when is_list(Config) ->
setlimit(10),
@@ -1458,6 +1467,15 @@ error_info(_Config) ->
{characters_to_nfkd_list, [abc]},
{characters_to_nfkd_list, [<<1:11>>]},
+ {category, [-1]},
+ {category, [foobar]},
+
+ {is_whitespace, [-1]},
+ {is_whitespace, [foobar]},
+
+ {is_id_start, [-1]},
+ {is_id_continue, [foobar]},
+
%% Not BIFs (they don't throw badarg when they fail).
{bom_to_encoding, 1}, %Not BIF.
{encoding_to_bom, 1}, %Not BIF.
@@ -1469,13 +1487,139 @@ error_info(_Config) ->
],
error_info_lib:test_error_info(unicode, L).
+
+-define(MAX_CHAR, 16#10FFFF).
+category(_Config) ->
+ Check = fun(Id) ->
+ LC = maps:get(category, unicode_util:lookup(Id)),
+ LC == unicode:category(Id)
+ end,
+ [] = [Id || Id <- lists:seq(1, ?MAX_CHAR), not Check(Id)],
+ {'EXIT', _} = catch unicode:category(-1),
+ {'EXIT', _} = catch unicode:category(5000000),
+ {'EXIT', _} = catch unicode:category(foobar),
+ ok.
+
+is_whitespace(Config) ->
+ Props = parse_properties(filename:join(proplists:get_value(data_dir, Config), "PropList.txt")),
+ WhiteSpaces = maps:get(white_space, Props),
+ Set = make_set(WhiteSpaces),
+ Test = fun(Char) ->
+ case {unicode:is_whitespace(Char), sets:is_element(Char, Set)} of
+ {X,X} -> false;
+ _ -> true
+ end
+ end,
+ [] = [{Char, integer_to_list(Char, 16), unicode_util:lookup(Char)}
+ || Char <- lists:seq(1, ?MAX_CHAR), Test(Char)],
+ ok.
+
+is_id(Config) ->
+ Props = parse_properties(filename:join(proplists:get_value(data_dir, Config),
+ "DerivedCoreProperties.txt")),
+ [] = id_start(Props),
+ [] = id_cont(Props),
+ ok.
+
+id_start(Props) ->
+ ID_Start = maps:get(id_start, Props),
+ Set = make_set(ID_Start),
+
+ TestStart = fun(Char) ->
+ case {unicode:is_id_start(Char), sets:is_element(Char, Set)} of
+ {X,X} -> false;
+ _ -> true
+ end
+ end,
+
+ [{Char, integer_to_list(Char, 16), unicode_util:lookup(Char)}
+ || Char <- lists:seq(1, ?MAX_CHAR), TestStart(Char)].
+
+id_cont(Props) ->
+ ID_Cont = maps:get(id_continue, Props),
+ Set = make_set(ID_Cont),
+ TestCont = fun(Char) ->
+ case {unicode:is_id_continue(Char), sets:is_element(Char, Set)} of
+ {X,X} -> false;
+ _ -> true
+ end
+ end,
+ [{Char, integer_to_list(Char, 16), unicode_util:lookup(Char)}
+ || Char <- lists:seq(1, ?MAX_CHAR), TestCont(Char)].
+
+
%%%
%%% Utilities.
%%%
-id(I) -> I.
-
setlimit(X) ->
erts_debug:set_internal_state(available_internal_state,true),
io:format("Setting loop limit, old: ~p, now set to ~p~n",
[erts_debug:set_internal_state(unicode_loop_limit,X),X]).
+
+make_set(ListOfRanges) ->
+ List = lists:foldl(fun add_range/2, [], ListOfRanges),
+ sets:from_list(List).
+
+add_range({A,undefined}, Acc) ->
+ [A|Acc];
+add_range({A,B}, Acc) ->
+ lists:seq(A,B) ++ Acc.
+
+parse_properties(File) ->
+ {ok, Fd} = file:open(File, [read, raw, {read_ahead, 1000000}]),
+ Props0 = foldl(fun parse_properties/2, [], Fd),
+ file:close(Fd),
+ Props1 = sofs:to_external(sofs:relation_to_family(sofs:relation(Props0))),
+ maps:from_list(Props1).
+
+parse_properties(Line0, Acc) ->
+ [Line|_Comments] = tokens(Line0, "#"),
+ [CodePoints, Class | _] = tokens(Line, ";"),
+ case tokens(CodePoints, ".") of
+ [CodePoint] ->
+ [{to_atom(Class), {hex_to_int(CodePoint), undefined}}|Acc];
+ [CodePoint1,"",CodePoint2] ->
+ [{to_atom(Class), {hex_to_int(CodePoint1), hex_to_int(CodePoint2)}}|Acc]
+ end.
+
+hex_to_int([]) -> [];
+hex_to_int(HexStr) ->
+ list_to_integer(string:trim(HexStr, both), 16).
+
+to_atom(Str) ->
+ list_to_atom(string:lowercase(string:trim(Str, both))).
+
+foldl(Fun, Acc, Fd) ->
+ Get = fun() -> file:read_line(Fd) end,
+ foldl_1(Fun, Acc, Get).
+
+foldl_1(_Fun, {done, Acc}, _Get) -> Acc;
+foldl_1(Fun, Acc, Get) ->
+ case Get() of
+ eof -> Acc;
+ {ok, "#" ++ _} -> %% Ignore comments
+ foldl_1(Fun, Acc, Get);
+ {ok, "\n"} -> %% Ignore empty lines
+ foldl_1(Fun, Acc, Get);
+ {ok, Line} ->
+ foldl_1(Fun, Fun(Line, Acc), Get)
+ end.
+
+%% Differs from string:lexemes, it returns empty string as token between two delimiters
+tokens(S, [C]) ->
+ tokens(lists:reverse(S), C, []).
+
+tokens([Sep|S], Sep, Toks) ->
+ tokens(S, Sep, [[]|Toks]);
+tokens([C|S], Sep, Toks) ->
+ tokens_2(S, Sep, Toks, [C]);
+tokens([], _, Toks) ->
+ Toks.
+
+tokens_2([Sep|S], Sep, Toks, Tok) ->
+ tokens(S, Sep, [Tok|Toks]);
+tokens_2([C|S], Sep, Toks, Tok) ->
+ tokens_2(S, Sep, Toks, [C|Tok]);
+tokens_2([], _Sep, Toks, Tok) ->
+ [Tok|Toks].
diff --git a/lib/stdlib/test/unicode_util_SUITE.erl b/lib/stdlib/test/unicode_util_SUITE.erl
index dbc7e21d11..6832196c13 100644
--- a/lib/stdlib/test/unicode_util_SUITE.erl
+++ b/lib/stdlib/test/unicode_util_SUITE.erl
@@ -95,8 +95,10 @@ casefold(_) ->
whitespace(_Config) ->
%% Pattern whitespace
- WS = unicode_util:whitespace(),
- WS = lists:filter(fun unicode_util:is_whitespace/1, WS),
+ WS = lists:sort(unicode_util:pattern_whitespace()),
+ %% is_whitespace are an extended subset of pattern_whitespace
+ %% (more tested in the unicode module)
+ WS = lists:sort(lists:filter(fun unicode_util:is_whitespace/1, WS) ++ [8206,8207]),
false = unicode_util:is_whitespace($A),
ok.
diff --git a/lib/stdlib/uc_spec/gen_unicode_mod.escript b/lib/stdlib/uc_spec/gen_unicode_mod.escript
index 35c3f05ce2..1d6507bf69 100644
--- a/lib/stdlib/uc_spec/gen_unicode_mod.escript
+++ b/lib/stdlib/uc_spec/gen_unicode_mod.escript
@@ -278,7 +278,7 @@ gen_header(Fd) ->
-moduledoc false.
-export([cp/1, gc/1]).
-export([nfd/1, nfc/1, nfkd/1, nfkc/1]).
--export([whitespace/0, is_whitespace/1]).
+-export([pattern_whitespace/0, is_whitespace/1]).
-export([uppercase/1, lowercase/1, titlecase/1, casefold/1]).
-export([spec_version/0, lookup/1, category/1, get_case/1]).
@@ -657,13 +657,16 @@ gen_norm(Fd) ->
ok.
gen_props(Fd, Props, Data) ->
- WS0 = maps:get(pattern_white_space, Props),
- WS = merge_ranges(WS0, split),
+ PWS0 = maps:get(pattern_white_space, Props),
+ PWS = merge_ranges(PWS0, split),
io:put_chars(Fd, "%% Useful non-breakable whitespace chars\n"
"%% defined as Pattern White Space in Unicode Standard Annex #31\n"),
- io:put_chars(Fd, "-spec whitespace() -> [gc()].\n"),
- WsChars = [CP || {CP, undefined} <- WS],
- io:format(Fd, "whitespace() -> ~w.\n\n", [[[$\r,$\n]|WsChars]]),
+ io:put_chars(Fd, "-spec pattern_whitespace() -> [gc()].\n"),
+ WsChars = [CP || {CP, undefined} <- PWS],
+ io:format(Fd, "pattern_whitespace() -> ~w.\n\n", [[[$\r,$\n]|WsChars]]),
+
+ WS0 = maps:get(white_space, Props),
+ WS = merge_ranges(WS0, split),
io:put_chars(Fd, "-spec is_whitespace(gc()) -> boolean().\n"),
IsWS = fun(Range) -> io:format(Fd, "is_whitespace~s true;\n", [gen_single_clause(Range)]) end,
--
2.51.0