File 6812-Export-category-and-some-helpers.patch of Package erlang
From 5abc976b3d276552bbff7551108deefcdd48af46 Mon Sep 17 00:00:00 2001
From: Dan Gudmundsson <dgud@erlang.org>
Date: Mon, 17 Nov 2025 13:33:05 +0100
Subject: [PATCH 2/4] Export category and some helpers
Export category which is needed for some use-cases, it was available
in lookup but created and unnecessary map.
Also added some needed helpers:
is_other_id_start/1,
is_other_id_continue/1,
is_letter_not_pattern_syntax/1.
To help definition of ID_Start and ID_continue
as described in Unicode spec.
---
lib/stdlib/uc_spec/gen_unicode_mod.escript | 128 +++++++++++++++++----
1 file changed, 104 insertions(+), 24 deletions(-)
diff --git a/lib/stdlib/uc_spec/gen_unicode_mod.escript b/lib/stdlib/uc_spec/gen_unicode_mod.escript
index 8cbb131a55..d75ba57bb7 100644
--- a/lib/stdlib/uc_spec/gen_unicode_mod.escript
+++ b/lib/stdlib/uc_spec/gen_unicode_mod.escript
@@ -259,7 +259,7 @@ gen_file(Fd, Data, ExclData, Props, WideCs, UpdateTests) ->
gen_header(Fd),
gen_static(Fd),
gen_norm(Fd),
- gen_ws(Fd, Props),
+ gen_props(Fd, Props, Data),
gen_cp(Fd),
gen_gc(Fd, Props),
gen_compose_pairs(Fd, ExclData, Data),
@@ -269,29 +269,75 @@ gen_file(Fd, Data, ExclData, Props, WideCs, UpdateTests) ->
ok.
gen_header(Fd) ->
- io:put_chars(Fd, "%%\n%% this file is generated do not modify\n"),
- io:put_chars(Fd, "%% see ../uc_spec/gen_unicode_mod.escript\n\n"),
- io:put_chars(Fd, "-module(" ++ ?MOD ++").\n"),
- io:put_chars(Fd, "-export([cp/1, gc/1]).\n"),
- io:put_chars(Fd, "-export([nfd/1, nfc/1, nfkd/1, nfkc/1]).\n"),
- io:put_chars(Fd, "-export([whitespace/0, is_whitespace/1]).\n"),
- io:put_chars(Fd, "-export([uppercase/1, lowercase/1, titlecase/1, casefold/1]).\n\n"),
- io:put_chars(Fd, "-export([spec_version/0, lookup/1, get_case/1]).\n"),
- io:put_chars(Fd, "-export([is_wide/1]).\n"),
- io:put_chars(Fd, "-compile({inline, [class/1]}).\n"),
- io:put_chars(Fd, "-compile(nowarn_unused_vars).\n"),
- io:put_chars(Fd, "-dialyzer({no_improper_lists, [cp/1, gc/1, gc_prepend/2]}).\n"),
- io:put_chars(Fd, "-type gc() :: char()|[char()].\n\n"),
- io:put_chars(Fd, "-define(IS_CP(CP), (is_integer(CP) andalso 0 =< CP andalso CP < 16#110000)).\n\n\n"),
- ok.
+ io:put_chars(Fd,"%%
+%% this file is generated do not modify
+%% see ../uc_spec/gen_unicode_mod.escript
+
+-module(unicode_util).
+-export([cp/1, gc/1]).
+-export([nfd/1, nfc/1, nfkd/1, nfkc/1]).
+-export([whitespace/0, is_whitespace/1]).
+-export([uppercase/1, lowercase/1, titlecase/1, casefold/1]).
+
+-export([spec_version/0, lookup/1, category/1, get_case/1]).
+-export([is_wide/1]).
+-export([is_other_id_start/1, is_other_id_continue/1, is_letter_not_pattern_syntax/1]).
+-compile({inline, [class/1]}).
+-compile(nowarn_unused_vars).
+-dialyzer({no_improper_lists, [cp/1, gc/1, gc_prepend/2]}).
+-type gc() :: char()|[char()].
+-type category() ::
+ {letter,uppercase} |
+ {letter,lowercase} |
+ {letter,titlecase} |
+ {mark,non_spacing} |
+ {mark,spacing_combining} |
+ {mark,enclosing} |
+ {number,decimal} |
+ {number,letter} |
+ {number,other} |
+ {separator,space} |
+ {separator,line} |
+ {separator,paragraph} |
+ {other,control} |
+ {other,format} |
+ {other,surrogate} |
+ {other,private} |
+ {other,not_assigned} |
+ {letter,modifier} |
+ {letter,other} |
+ {punctuation,connector} |
+ {punctuation,dash} |
+ {punctuation,open} |
+ {punctuation,close} |
+ {punctuation,initial} |
+ {punctuation,final} |
+ {punctuation,other} |
+ {symbol,math} |
+ {symbol,currency} |
+ {symbol,modifier} |
+ {symbol,other}.
+
+-define(IS_CP(CP), (is_integer(CP) andalso 0 =< CP andalso CP < 16#110000)).
+"),
+ ok.
gen_static(Fd) ->
- io:put_chars(Fd, "-spec lookup(char()) -> #{'canon':=[{byte(),char()}], 'ccc':=byte(), "
- "'compat':=[] | {atom(),[{byte(),char()}]}, 'category':={atom(),atom()}}.\n"),
- io:put_chars(Fd, "lookup(Codepoint) when ?IS_CP(Codepoint) ->\n"
- " {CCC,Can,Comp,Cat} = unicode_table(Codepoint),\n"
- " #{ccc=>CCC, canon=>Can, compat=>Comp, category=>category(Codepoint,Cat)}.\n\n"),
-
+ io:put_chars(Fd, "-spec lookup(char()) ->
+ #{'canon':= [{byte(),char()}],
+ 'ccc':= byte(),
+ 'compat':= [] | {atom(),[{byte(),char()}]},
+ 'category':= category()}.
+lookup(Codepoint) when ?IS_CP(Codepoint) ->
+ {CCC,Can,Comp,Cat} = unicode_table(Codepoint),
+ #{ccc=>CCC, canon=>Can, compat=>Comp, category=>category(Codepoint,Cat)}.
+
+-spec category(char()) -> category().
+category(Codepoint) when ?IS_CP(Codepoint) ->
+ {_,_,_,Cat} = unicode_table(Codepoint),
+ category(Codepoint,Cat).
+
+"),
io:put_chars(Fd, "-spec get_case(char()) -> #{'fold':=gc(), 'lower':=gc(), 'title':=gc(), 'upper':=gc()}.\n"),
io:put_chars(Fd, "get_case(Codepoint) when ?IS_CP(Codepoint) ->\n"
" case case_table(Codepoint) of\n"
@@ -601,7 +651,7 @@ gen_norm(Fd) ->
ok.
-gen_ws(Fd, Props) ->
+gen_props(Fd, Props, Data) ->
WS0 = maps:get(pattern_white_space, Props),
WS = merge_ranges(WS0, split),
io:put_chars(Fd, "%% Useful non-breakable whitespace chars\n"
@@ -615,6 +665,36 @@ gen_ws(Fd, Props) ->
io:format(Fd, "is_whitespace([13,10]) -> true;\n", []),
[IsWS(CP) || CP <- WS],
io:put_chars(Fd, "is_whitespace(_) -> false.\n\n"),
+
+ OIDS = maps:get(other_id_start, Props),
+ io:put_chars(Fd, "-spec is_other_id_start(gc()) -> boolean().\n"),
+ IsODIS = fun(Range) -> io:format(Fd, "is_other_id_start~s true;\n", [gen_single_clause(Range)]) end,
+ [IsODIS(CP) || CP <- OIDS],
+ io:put_chars(Fd, "is_other_id_start(_) -> false.\n\n"),
+
+ OICS = maps:get(other_id_continue, Props),
+ io:put_chars(Fd, "-spec is_other_id_continue(gc()) -> boolean().\n"),
+ IsOICS = fun(Range) -> io:format(Fd, "is_other_id_continue~s true;\n", [gen_single_clause(Range)]) end,
+ [IsOICS(CP) || CP <- OICS],
+ io:put_chars(Fd, "is_other_id_continue(_) -> false.\n\n"),
+
+ PS0 = maps:get(pattern_syntax, Props),
+ io:put_chars(Fd, "-spec is_letter_not_pattern_syntax(gc()) -> boolean().\n"),
+ IsNLPS = fun(Range) -> io:format(Fd, "is_letter_not_pattern_syntax~s false;\n", [gen_single_clause(Range)]) end,
+ KeepCat = fun(CP) ->
+ case array:get(CP, Data) of
+ #cp{cat = [$P,_]} -> false;
+ #cp{cat = [$S,_]} -> false;
+ #cp{cat = [$C,_]} -> false;
+ undefined -> false;
+ _ -> true
+ end
+ end,
+ PS = [{PSC, undefined} || {PSC, undefined} <- split_ranges(PS0, []), KeepCat(PSC)],
+ %% [io:format("~p ~p~n", [P, (array:get(P, Data))#cp.cat]) || {P,_} <- PS],
+ [IsNLPS(CP) || CP <- PS],
+ io:put_chars(Fd, "is_letter_not_pattern_syntax(_) -> true.\n\n"),
+
ok.
gen_cp(Fd) ->
--
2.51.0