File 0694-Fix-xml-regexp-bug-in-XSD-validation.patch of Package erlang
From 88f2ef99b1033c3add6ea5c1aacf05857af2677e Mon Sep 17 00:00:00 2001
From: Lars Thorsen <lars@erlang.org>
Date: Thu, 4 Dec 2025 15:40:23 +0100
Subject: [PATCH] Fix xml regexp bug in XSD validation
XML regular expression evaluator in XSD validation didn't handle
'\s' and '\S' correctly.
Removed some compiler warnings in test code.
---
.gitignore | 2 +
lib/xmerl/doc/examples/xmerl_test.erl | 148 +-
lib/xmerl/src/Makefile | 8 +-
lib/xmerl/src/xmerl.app.src | 3 +-
lib/xmerl/src/xmerl_regexp.erl | 1444 -----------------
lib/xmerl/src/xmerl_xsd_re.erl | 142 ++
lib/xmerl/src/xmerl_xsd_re_parse.yrl | 580 +++++++
lib/xmerl/src/xmerl_xsd_type.erl | 131 +-
lib/xmerl/test/xmerl_SUITE.erl | 1 -
lib/xmerl/test/xmerl_xsd_SUITE.erl | 12 +-
.../xmerl_xsd_SUITE_data/ticket_19762.xml | 4 +
.../xmerl_xsd_SUITE_data/ticket_19762.xsd | 26 +
12 files changed, 913 insertions(+), 1588 deletions(-)
delete mode 100644 lib/xmerl/src/xmerl_regexp.erl
create mode 100644 lib/xmerl/src/xmerl_xsd_re.erl
create mode 100644 lib/xmerl/src/xmerl_xsd_re_parse.yrl
create mode 100644 lib/xmerl/test/xmerl_xsd_SUITE_data/ticket_19762.xml
create mode 100644 lib/xmerl/test/xmerl_xsd_SUITE_data/ticket_19762.xsd
diff --git a/lib/xmerl/doc/examples/xmerl_test.erl b/lib/xmerl/doc/examples/xmerl_test.erl
index f180d9b81f..44952068b4 100644
--- a/lib/xmerl/doc/examples/xmerl_test.erl
+++ b/lib/xmerl/doc/examples/xmerl_test.erl
@@ -28,7 +28,7 @@ test3() ->
io:format("From xmerl:export/2 xmerl_html filter~n ~p~n", [B]),
C = xmerl:export([A], xmerl_text),
io:format("From xmerl:export/2 xmerl_text filter~n ~p~n", [C]).
-
+
test4() ->
FetchFun = fun(_DTDSpec, S) -> {ok, not_fetched, S} end,
@@ -61,14 +61,14 @@ test6() ->
simple() ->
- [{document,
+ [{document,
[{title, ["Doc Title"]},
{author, ["Ulf Wiger"]},
{section,[{heading, ["heading1"]},
{'P', ["This is a paragraph of text."]},
{section,[{heading, ["heading2"]},
{'P', ["This is another paragraph."]},
- {table,[{border, ["1"]},
+ {table,[{border, ["1"]},
{heading,[{col, ["head1"]},
{col, ["head2"]}]},
{row, [{col, ["col11"]},
@@ -166,7 +166,7 @@ w3cvalidate() ->
C = xmerl:export([A], xmerl_test),
io:format("From xmerl:export/2 xmerl_text filter~n ~p~n", [C])
end.
-
+
'TESTSUITE'(_Data, Attrs, _Parents, _E) ->
_Profile = find_attribute('PROFILE', Attrs),
@@ -186,7 +186,7 @@ w3cvalidate() ->
Id = find_attribute('ID', Attrs),
io:format("Test: ~p ",[Id]),
Entities = find_attribute('ENTITIES', Attrs), % Always handle all entities
- Output1 = find_attribute('OUTPUT', Attrs), %
+ Output1 = find_attribute('OUTPUT', Attrs), %
Output3 = find_attribute('OUTPUT3', Attrs), % FIXME!
Sections = find_attribute('SECTIONS', Attrs),
Recommendation = find_attribute('RECOMMENDATION', Attrs), % FIXME!
@@ -253,18 +253,18 @@ test_valid(URI, Data, Sections, Entities, OutputForm, Recommendation, Version,
print_error({Res, Tail}, URI, Sections, Entities, OutputForm,
Recommendation,
Version, Namespace, Data),
- if
- ?CONT == false -> throw({'EXIT', failed_test});
- true -> error
- end
+ case ?CONT of
+ false -> throw({'EXIT', failed_test});
+ true -> error
+ end
end;
Error ->
print_error(Error, URI, Sections, Entities, OutputForm, Recommendation,
Version, Namespace, Data),
- if
- ?CONT == false -> throw({'EXIT', failed_test});
- true -> error
- end
+ case ?CONT of
+ false -> throw({'EXIT', failed_test});
+ true -> error
+ end
end,
io:format("validating ", []),
case validating_parser_q(URI) of
@@ -277,18 +277,18 @@ test_valid(URI, Data, Sections, Entities, OutputForm, Recommendation, Version,
print_error({Res2, Tail2}, URI, Sections, Entities, OutputForm,
Recommendation,
Version, Namespace, Data),
- if
- ?CONT == false -> throw({'EXIT', failed_test});
- true -> error
- end
+ case ?CONT of
+ false -> throw({'EXIT', failed_test});
+ true -> error
+ end
end;
Error2 ->
print_error(Error2, URI, Sections, Entities, OutputForm, Recommendation,
Version, Namespace, Data),
- if
- ?CONT == false -> throw({'EXIT', failed_test});
- true -> error
- end
+ case ?CONT of
+ false -> throw({'EXIT', failed_test});
+ true -> error
+ end
end.
@@ -307,18 +307,18 @@ test_invalid(URI, Data, Sections, Entities, OutputForm, Recommendation, Version,
print_error({Res, Tail}, URI, Sections, Entities, OutputForm,
Recommendation,
Version, Namespace, Data),
- if
- ?CONT == false -> throw({'EXIT', failed_test});
- true -> error
- end
+ case ?CONT of
+ false -> throw({'EXIT', failed_test});
+ true -> error
+ end
end;
Error ->
print_error(Error, URI, Sections, Entities, OutputForm, Recommendation,
Version, Namespace, Data),
- if
- ?CONT == false -> throw({'EXIT', failed_test});
- true -> error
- end
+ case ?CONT of
+ false -> throw({'EXIT', failed_test});
+ true -> error
+ end
end,
io:format("validating ", []),
case validating_parser_q(URI) of
@@ -331,18 +331,18 @@ test_invalid(URI, Data, Sections, Entities, OutputForm, Recommendation, Version,
print_error({Res2, Tail2}, URI, Sections, Entities, OutputForm,
Recommendation,
Version, Namespace, Data),
- if
- ?CONT == false -> throw({'EXIT', failed_test});
- true -> error
- end
+ case ?CONT of
+ false -> throw({'EXIT', failed_test});
+ true -> error
+ end
end;
{error, enoent} ->
print_error("Testfile not found", URI, Sections, Entities, OutputForm,
Recommendation, Version, Namespace, Data),
- if
- ?CONT == false -> throw({'EXIT', failed_test});
- true -> error
- end;
+ case ?CONT of
+ false -> throw({'EXIT', failed_test});
+ true -> error
+ end;
_Error2 ->
io:format("OK~n", []),
ok
@@ -363,18 +363,18 @@ test_notwf(URI, Data, Sections, Entities, OutputForm, Recommendation, Version,
print_error({Res, Tail}, URI, Sections, Entities, OutputForm,
Recommendation,
Version, Namespace, Data),
- if
- ?CONT == false -> throw({'EXIT', failed_test});
- true -> error
- end
+ case ?CONT of
+ false -> throw({'EXIT', failed_test});
+ true -> error
+ end
end;
{error,enoent} ->
print_error("Testfile not found",URI,Sections,Entities,OutputForm,
Recommendation,Version,Namespace,Data),
- if
- ?CONT==false -> throw({'EXIT', failed_test});
- true -> error
- end;
+ case ?CONT of
+ false -> throw({'EXIT', failed_test});
+ true -> error
+ end;
_Error ->
io:format("OK ",[]),
ok
@@ -390,18 +390,18 @@ test_notwf(URI, Data, Sections, Entities, OutputForm, Recommendation, Version,
print_error({Res2, Tail2}, URI, Sections, Entities, OutputForm,
Recommendation,
Version, Namespace, Data),
- if
- ?CONT == false -> throw({'EXIT', failed_test});
- true -> error
- end
+ case ?CONT of
+ false -> throw({'EXIT', failed_test});
+ true -> error
+ end
end;
{error,enoent} ->
print_error("Testfile not found", URI, Sections, Entities, OutputForm,
Recommendation, Version, Namespace, Data),
- if
- ?CONT == false -> throw({'EXIT', failed_test});
- true -> error
- end;
+ case ?CONT of
+ false -> throw({'EXIT', failed_test});
+ true -> error
+ end;
_Error2 ->
io:format("OK~n", []),
ok
@@ -418,17 +418,17 @@ test_error(URI, Data, Sections, Entities, OutputForm, Recommendation, Version,
{error, enoent} ->
print_error("Testfile not found", URI, Sections, Entities, OutputForm,
Recommendation, Version, Namespace, Data),
- if
- ?CONT == false -> throw({'EXIT', failed_test});
- true -> error
- end;
+ case ?CONT of
+ false -> throw({'EXIT', failed_test});
+ true -> error
+ end;
Res ->
print_error(Res, URI, Sections, Entities, OutputForm, Recommendation,
Version, Namespace, Data),
- if
- ?CONT == false -> throw({'EXIT', failed_test});
- true -> error
- end
+ case ?CONT of
+ false -> throw({'EXIT', failed_test});
+ true -> error
+ end
end,
io:format("validating ", []),
case validating_parser_q(URI) of
@@ -438,17 +438,17 @@ test_error(URI, Data, Sections, Entities, OutputForm, Recommendation, Version,
{error, enoent} ->
print_error("Testfile not found", URI, Sections, Entities, OutputForm,
Recommendation, Version, Namespace, Data),
- if
- ?CONT == false -> throw({'EXIT', failed_test});
- true -> error
- end;
+ case ?CONT of
+ false -> throw({'EXIT', failed_test});
+ true -> error
+ end;
Res2 ->
print_error(Res2, URI, Sections, Entities, OutputForm, Recommendation,
Version, Namespace, Data),
- if
- ?CONT == false -> throw({'EXIT', failed_test});
- true -> error
- end
+ case ?CONT of
+ false -> throw({'EXIT', failed_test});
+ true -> error
+ end
end.
@@ -506,10 +506,10 @@ print_error(Error, URI, Sections, Entities, OutputForm, Recommendation, Version,
io:format(Data).
-
-
-
-
+
+
+
+
@@ -521,5 +521,3 @@ para(_Data, _Attrs, US) ->
Int when is_integer(Int) -> Int+1;
undefined -> 1
end.
-
-
diff --git a/lib/xmerl/src/Makefile b/lib/xmerl/src/Makefile
index 56eb876c65..69aa4dc6fb 100644
--- a/lib/xmerl/src/Makefile
+++ b/lib/xmerl/src/Makefile
@@ -68,7 +68,6 @@ MODULES = $(EDOC_MODULES) \
xmerl_html \
xmerl_lib \
xmerl_otpsgml \
- xmerl_regexp \
xmerl_sgml \
xmerl_simple \
xmerl_text \
@@ -90,7 +89,9 @@ MODULES = $(EDOC_MODULES) \
xmerl_sax_parser_utf16be \
xmerl_sax_parser_utf16le \
xmerl_sax_simple_dom \
- xmerl_sax_old_dom
+ xmerl_sax_old_dom \
+ xmerl_xsd_re \
+ xmerl_xsd_re_parse
@@ -185,6 +186,9 @@ xmerl_xpath_parse.erl: xmerl_xpath_parse.yrl
xmerl_b64Bin.erl: xmerl_b64Bin.yrl
$(yecc_verbose)$(ERLC) -o $(ESRC) $(DETERMINISM_FLAG) $<
+xmerl_xsd_re_parse.erl: xmerl_xsd_re_parse.yrl
+ $(yecc_verbose)$(ERLC) -o $(ESRC) $(DETERMINISM_FLAG) $<
+
xmerl_sax_parser_list.erl: xmerl_sax_parser_list.erlsrc xmerl_sax_parser_base.erlsrc
$(gen_verbose)cat xmerl_sax_parser_list.erlsrc xmerl_sax_parser_base.erlsrc >$@
diff --git a/lib/xmerl/src/xmerl.app.src b/lib/xmerl/src/xmerl.app.src
index 67511902cf..2881aa6766 100644
--- a/lib/xmerl/src/xmerl.app.src
+++ b/lib/xmerl/src/xmerl.app.src
@@ -9,7 +9,6 @@
xmerl_html,
xmerl_lib,
xmerl_otpsgml,
- xmerl_regexp,
xmerl_sax_parser,
xmerl_sax_parser_list,
xmerl_sax_parser_latin1,
@@ -35,6 +34,8 @@
xmerl_xpath_scan,
xmerl_xs,
xmerl_xsd,
+ xmerl_xsd_re,
+ xmerl_xsd_re_parse,
xmerl_xsd_type
]},
diff --git a/lib/xmerl/src/xmerl_regexp.erl b/lib/xmerl/src/xmerl_regexp.erl
deleted file mode 100644
index bfca3b50df..0000000000
--- a/lib/xmerl/src/xmerl_regexp.erl
+++ /dev/null
@@ -1,1443 +0,0 @@
-%%
-%% %CopyrightBegin%
-%%
-%% Copyright Ericsson AB 2006-2025. All Rights Reserved.
-%%
-%% Licensed under the Apache License, Version 2.0 (the "License");
-%% you may not use this file except in compliance with the License.
-%% You may obtain a copy of the License at
-%%
-%% http://www.apache.org/licenses/LICENSE-2.0
-%%
-%% Unless required by applicable law or agreed to in writing, software
-%% distributed under the License is distributed on an "AS IS" BASIS,
-%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-%% See the License for the specific language governing permissions and
-%% limitations under the License.
-%%
-%% %CopyrightEnd%
-%%
-
-%%
--module(xmerl_regexp).
-
-%% This module provides a basic set of regular expression functions
-%% for strings. The functions provided are taken from AWK.
-%%
-%% Note that we interpret the syntax tree of a regular expression
-%% directly instead of converting it to an NFA and then interpreting
-%% that. This method seems to go significantly faster.
-
--export([sh_to_awk/1,parse/1,format_error/1,match/2,first_match/2,matches/2]).
--export([sub/3,gsub/3,split/2,sub_match/2,sub_first_match/2]).
-
--export([make_nfa/1,make_dfa/1,make_dfa/2,compile/1]).
-
--import(string, [substr/2,substr/3]).
--import(lists, [reverse/1,reverse/2,last/1,duplicate/2,seq/2]).
--import(lists, [member/2,keysearch/3,keysort/2,map/2,foldl/3]).
--import(ordsets, [is_element/2,add_element/2,union/2,subtract/2]).
-
-%%-compile([export_all]).
-
--export([setup/1,compile_proc/2]).
-
--include("xmerl_internal.hrl").
-
-setup(RE0) ->
- RE = setup(RE0, [$^]),
- Pid = spawn(?MODULE,compile_proc,[self(),RE]),
- receive
- {ok,Result} ->
- Result
- after 2000 ->
- exit(Pid,force),
- parse(RE)
- end.
- %% compile(RE).
-%%RE.
-compile_proc(From,RE) ->
- Res = compile(RE),
- From ! {ok,Res}.
-
-
-setup([$\\,$d|S],Acc) -> setup(S,"]9-0[" ++Acc);
-setup([$\\,$D|S],Acc) -> setup(S,"]9-0^[" ++Acc);
-setup([$\\,$s|S],Acc) -> setup(S,"]s\\t\\n\\r\\[" ++Acc);
-setup([$\\,$S|S],Acc) -> setup(S,"]\\s\\t\\n\\r^[" ++Acc);
-setup([$\\,$i|S],Acc) -> setup(S,"]z-aZ-A_:[" ++Acc); %% Only Latin-1 now
-setup([$\\,$I|S],Acc) -> setup(S,"]z-aZ-A_:^[" ++Acc);
-setup([$\\,$c|S],Acc) -> setup(S,"]9-0z-aZ-A_:."++[183]++"-[" ++Acc);
-setup([$\\,$C|S],Acc) -> setup(S,"]9-0z-aZ-A_:."++[183]++"-^[" ++Acc);
-%% fixme setup([$\\,$w|S]) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup([$\\,$W|S]) -> {{comp_class,"\s\t\n\r"},S};
-%% Letter, Any
-%% fixme setup(["\\p{L}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{L}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Letter, Uppercase
-%% fixme setup(["\\p{Lu}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Lu}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Letter, Lowercase
-%% fixme setup(["\\p{Ll}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Ll}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Letter, Titlecase
-%% fixme setup(["\\p{Lt}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Lt}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Letter, Modifier
-%% fixme setup(["\\p{Lm}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Lm}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Letter, Other
-%% fixme setup(["\\p{Lo}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Lo}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Mark, Any
-%% fixme setup(["\\p{M}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{M}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Mark, Nonspacing
-%% fixme setup(["\\p{Mn}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Mn}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Mark, Spacing Combining
-%% fixme setup(["\\p{Mc}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Mc}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Mark, Enclosing
-%% fixme setup(["\\p{Me}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Me}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Number, Any
-%% fixme setup(["\\p{N}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{N}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Number, Decimal Digit
-%% fixme setup(["\\p{Nd}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Nd}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Number, Letter
-%% fixme setup(["\\p{Nl}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Nl}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Number, Other
-%% fixme setup(["\\p{No}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{No}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Punctuation, Any
-%% fixme setup(["\\p{P}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{P}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Punctuation, Connector
-%% fixme setup(["\\p{Pc}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Pc}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Punctuation, Dash
-%% fixme setup(["\\p{Pd}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Pd}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Punctuation, Open
-%% fixme setup(["\\p{Ps}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Ps}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Punctuation, Close
-%% fixme setup(["\\p{Pe}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Pe}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Punctuation, Initial quote (may behave like Ps or Pe, depending on usage)
-%% fixme setup(["\\p{Pi}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Pi}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Punctuation, Final quote (may behave like Ps or Pe, depending on usage)
-%% fixme setup(["\\p{Pf}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Pf}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Punctuation, Other
-%% fixme setup(["\\p{Po}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Po}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Symbol, Any
-%% fixme setup(["\\p{S}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{S}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Symbol, Math
-%% fixme setup(["\\p{Sm}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Sm}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Symbol, Currency
-%% fixme setup(["\\p{Sc}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Sc}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Symbol, Modifier
-%% fixme setup(["\\p{Sk}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Sk}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Symbol, Other
-%% fixme setup(["\\p{So}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{So}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Separator, Any
-%% fixme setup(["\\p{Z}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Z}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Separator, Space
-%% fixme setup(["\\p{Zs}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Zs}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Separator, Line
-%% fixme setup(["\\p{Zl}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Zl}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Separator, Paragraph
-%% fixme setup(["\\p{Zp}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Zp}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Other, Any
-%% fixme setup(["\\p{C}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{C}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Other, Control
-%% fixme setup(["\\p{Cc}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Cc}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Other, Format
-%% fixme setup(["\\p{Cf}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Cf}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Other, Surrogate not supported by schema recommendation
-%% fixme setup(["\\p{Cs}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Cs}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Other, Private Use
-%% fixme setup(["\\p{Co}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Co}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-%% Other, Not assigned (no characters in the file have this property)
-%% fixme setup(["\\p{Cn}" ++ S) -> {{char_class,"\s\t\n\r"},S};
-%% fixme setup(["\\P{Cn}" ++ S) -> {{comp_class,"\s\t\n\r"},S};
-setup([A|S], Acc) -> setup(S, [A|Acc]);
-setup([],Acc) -> reverse([$$|Acc]).
-
-%% sh_to_awk(ShellRegExp)
-%% Convert a sh style regexp into a full AWK one. The main difficulty is
-%% getting character sets right as the conventions are different.
-
-sh_to_awk(Sh) -> "^(" ++ sh_to_awk_1(Sh). %Fix the beginning
-
-sh_to_awk_1([$*|Sh]) -> %This matches any string
- ".*" ++ sh_to_awk_1(Sh);
-sh_to_awk_1([$?|Sh]) -> %This matches any character
- [$.|sh_to_awk_1(Sh)];
-sh_to_awk_1([$[,$^,$]|Sh]) -> %This takes careful handling
- "\\^" ++ sh_to_awk_1(Sh);
-%% Must move '^' to end.
-sh_to_awk_1("[^" ++ Sh) -> [$[|sh_to_awk_2(Sh, true)];
-sh_to_awk_1("[!" ++ Sh) -> "[^" ++ sh_to_awk_2(Sh, false);
-sh_to_awk_1([$[|Sh]) -> [$[|sh_to_awk_2(Sh, false)];
-sh_to_awk_1([C|Sh]) ->
- %% Unspecialise everything else which is not an escape character.
- case sh_special_char(C) of
- true -> [$\\,C|sh_to_awk_1(Sh)];
- false -> [C|sh_to_awk_1(Sh)]
- end;
-sh_to_awk_1([]) -> ")$". %Fix the end
-
-sh_to_awk_2([$]|Sh], UpArrow) -> [$]|sh_to_awk_3(Sh, UpArrow)];
-sh_to_awk_2(Sh, UpArrow) -> sh_to_awk_3(Sh, UpArrow).
-
-sh_to_awk_3([$]|Sh], true) -> "^]" ++ sh_to_awk_1(Sh);
-sh_to_awk_3([$]|Sh], false) -> [$]|sh_to_awk_1(Sh)];
-sh_to_awk_3([C|Sh], UpArrow) -> [C|sh_to_awk_3(Sh, UpArrow)];
-sh_to_awk_3([], true) -> [$^|sh_to_awk_1([])];
-sh_to_awk_3([], false) -> sh_to_awk_1([]).
-
-%% -type sh_special_char(char()) -> bool().
-%% Test if a character is a special character.
-
-sh_special_char($|) -> true;
-sh_special_char($*) -> true;
-sh_special_char($+) -> true;
-sh_special_char($?) -> true;
-sh_special_char($() -> true;
-sh_special_char($)) -> true;
-sh_special_char($\\) -> true;
-sh_special_char($^) -> true;
-sh_special_char($$) -> true;
-sh_special_char($.) -> true;
-sh_special_char($[) -> true;
-sh_special_char($]) -> true;
-sh_special_char($") -> true;
-sh_special_char(_C) -> false.
-
-%% parse(RegExp) -> {ok,RE} | {error,E}.
-%% Parse the regexp described in the string RegExp.
-
-parse(S) ->
- try reg(S, 0) of
- {R,Sc,[]} ->
- {ok,{regexp,{R,Sc}}};
- {_R,_Sc,[C|_]} ->
- {error,{illegal,[C]}}
- catch
- throw:{error,E} -> {error,E}
- end.
-
-%% format_error(Error) -> String.
-
-format_error({interval_range,What}) ->
- ["illegal interval range",io_lib:write_string(What)];
-format_error({illegal,What}) -> ["illegal character `",What,"'"];
-format_error({unterminated,What}) -> ["unterminated `",What,"'"];
-format_error({posix_cc,What}) ->
- ["illegal POSIX character class ",io_lib:write_string(What)];
-format_error({char_class,What}) ->
- ["illegal character class ",io_lib:write_string(What)].
-
-%% match(String, RegExp) -> {match,Start,Length} | nomatch | {error,E}.
-%% Find the longest match of RegExp in String.
-
-match(S, RegExp) when is_list(RegExp) ->
- case parse(RegExp) of
- {ok,RE} -> match(S, RE);
- {error,E} -> {error,E}
- end;
-match(S, {regexp,RE}) ->
- case match_re(RE, S, 1, 0, -1) of
- {Start,Len} when Len >= 0 ->
- {match,Start,Len};
- {_Start,_Len} -> nomatch
- end;
-match(S, {comp_regexp,RE}) ->
- case match_comp(RE, S, 1, 0, -1) of
- {Start,Len} when Len >= 0 ->
- {match,Start,Len};
- {_Start,_Len} -> nomatch
- end.
-
-match_re(RE, [_|Cs]=S0, P0, Mst, Mlen) ->
- case re_apply(S0, P0, RE) of
- {match,P1,_S1,_Subs} ->
- Len = P1-P0,
- if Len > Mlen -> match_re(RE, Cs, P0+1, P0, Len);
- true -> match_re(RE, Cs, P0+1, Mst, Mlen)
- end;
- nomatch -> match_re(RE, Cs, P0+1, Mst, Mlen);
- never_match -> {Mst,Mlen} %No need to go on
- end;
-match_re(_RE, _S, _P, Mst, Mlen) -> {Mst,Mlen}.
-
-match_comp(RE, [_|Cs]=S0, P0, Mst, Mlen) ->
- case comp_apply(S0, P0, RE) of
- {match,P1,_S1} ->
- Len = P1-P0,
- if Len > Mlen -> match_comp(RE, Cs, P0+1, P0, Len);
- true -> match_comp(RE, Cs, P0+1, Mst, Mlen)
- end;
- nomatch -> match_comp(RE, Cs, P0+1, Mst, Mlen)
- end;
-match_comp(_RE, _S, _P, Mst, Mlen) -> {Mst,Mlen}.
-
-%% match_re(RE, S0, Pos0, Mst, Mlen) ->
-%% case first_match_re(RE, S0, Pos0) of
-%% {St,Len,_} -> %Found a match
-%% Pos1 = St + 1, %Where to start next match
-%% S1 = lists:nthtail(Pos1-Pos0, S0),
-%% if Len > Mlen -> match_re(RE, S1, Pos1, St, Len);
-%% true -> match_re(RE, S1, Pos1, Mst, Mlen)
-%% end;
-%% nomatch -> {Mst,Mlen}
-%% end.
-
-%% match_comp(RE, S0, Pos0, Mst, Mlen) ->
-%% case first_match_comp(RE, S0, Pos0) of
-%% {St,Len} -> %Found a match
-%% Pos1 = St + 1, %Where to start next match
-%% S1 = lists:nthtail(Pos1-Pos0, S0),
-%% if Len > Mlen -> match_comp(RE, S1, Pos1, St, Len);
-%% true -> match_comp(RE, S1, Pos1, Mst, Mlen)
-%% end;
-%% nomatch -> {Mst,Mlen}
-%% end.
-
-%% first_match(String, RegExp) -> {match,Start,Length} | nomatch | {error,E}.
-%% Find the first match of RegExp in String.
-
-first_match(S, RegExp) when is_list(RegExp) ->
- case parse(RegExp) of
- {ok,RE} -> first_match(S, RE);
- {error,E} -> {error,E}
- end;
-first_match(S, {regexp,RE}) ->
- case first_match_re(RE, S, 1) of
- {Start,Len,_} -> {match,Start,Len};
- nomatch -> nomatch
- end;
-first_match(S, {comp_regexp,RE}) ->
- case first_match_comp(RE, S, 1) of
- {Start,Len} -> {match,Start,Len};
- nomatch -> nomatch
- end.
-
-first_match_re(RE, S, St) when S /= [] ->
- case re_apply(S, St, RE) of
- {match,P,_Rest,Subs} -> {St,P-St,Subs};
- nomatch -> first_match_re(RE, tl(S), St+1);
- never_match -> nomatch
- end;
-first_match_re(_RE, [], _St) -> nomatch.
-
-first_match_comp(RE, S, St) when S /= [] ->
- case comp_apply(S, St, RE) of
- {match,P,_Rest} -> {St,P-St};
- nomatch -> first_match_comp(RE, tl(S), St+1)
- end;
-first_match_comp(_RE, [], _St) -> nomatch.
-
-%% matches(String, RegExp) -> {match,[{Start,Length}]} | {error,E}.
-%% Return the all the non-overlapping matches of RegExp in String.
-
-matches(S, RegExp) when is_list(RegExp) ->
- case parse(RegExp) of
- {ok,RE} -> matches(S, RE);
- {error,E} -> {error,E}
- end;
-matches(S, {regexp,RE}) -> {match,matches_re(S, RE, 1)};
-matches(S, {comp_regexp,RE}) -> {match,matches_comp(S, RE, 1)}.
-
-matches_re([_|Cs]=S0, RE, P0) ->
- case re_apply(S0, P0, RE) of
- {match,P0,S1,_Subs} -> %0 length match
- [{P0,0}|matches_re(tl(S1), RE, P0+1)];
- {match,P1,S1,_Subs} ->
- [{P0,P1-P0}|matches_re(S1, RE, P1)];
- nomatch -> matches_re(Cs, RE, P0+1);
- never_match -> []
- end;
-matches_re([], _RE, _P) -> [].
-
-matches_comp([_|Cs]=S0, RE, P0) ->
- case comp_apply(S0, P0, RE) of
- {match,P0,S1} -> %0 length match
- [{P0,0}|matches_comp(tl(S1), RE, P0+1)];
- {match,P1,S1} ->
- [{P0,P1-P0}|matches_comp(S1, RE, P1)];
- nomatch -> matches_comp(Cs, RE, P0+1)
- end;
-matches_comp([], _RE, _P) -> [].
-
-%% sub(String, RegExp, Replace) -> {ok,RepString,RepCount} | {error,E}.
-%% Substitute the first match of the regular expression RegExp with
-%% the string Replace in String. Accept pre-parsed regular
-%% expressions.
-
-sub(String, RegExp, Rep) when is_list(RegExp) ->
- case parse(RegExp) of
- {ok,RE} -> sub(String, RE, Rep);
- {error,E} -> {error,E}
- end;
-sub(String, {regexp,RE}, Rep) ->
- case sub_re(String, 1, RE, [], Rep) of
- {yes,NewStr} -> {ok,NewStr,1};
- no -> {ok,String,0}
- end;
-sub(String, {comp_regexp,RE}, Rep) ->
- case sub_comp(String, 1, RE, [], Rep) of
- {yes,NewStr} -> {ok,NewStr,1};
- no -> {ok,String,0}
- end.
-
-%% sub_re(String, Position, Regexp, Before, Replacement) ->
-%% {NewString,Count}.
-%% sub_comp(String, Position, Regexp, Before, Replacement) ->
-%% {NewString,Count}.
-%% Step forward over String until a match is found saving stepped over
-%% chars in Before. Return reversed Before prepended to replacement
-%% and rest of string.
-
-sub_re([C|Cs]=S0, P0, RE, Bef, Rep) ->
- case re_apply(S0, P0, RE) of
- {match,P0,_S1,_} -> %Ignore 0 length match
- sub_re(Cs, P0+1, RE, [C|Bef], Rep);
- {match,P1,Rest,_Gps} ->
- {yes,reverse(Bef, sub_repl(Rep, substr(S0, 1, P1-P0), Rest))};
- nomatch -> sub_re(Cs, P0+1, RE, [C|Bef], Rep);
- never_match -> no %No need to go on
- end;
-sub_re([], _P, _RE, _Bef, _Rep) -> no.
-
-sub_comp([C|Cs]=S0, P0, RE, Bef, Rep) ->
- case comp_apply(S0, P0, RE) of
- {match,P0,_S1} -> %Ignore 0 length match
- sub_comp(Cs, P0+1, RE, [C|Bef], Rep);
- {match,P1,Rest} ->
- {yes,reverse(Bef, sub_repl(Rep, substr(S0, 1, P1-P0), Rest))};
- nomatch -> sub_comp(Cs, P0+1, RE, [C|Bef], Rep)
- end;
-sub_comp([], _P, _RE, _Bef, _Rep) -> no.
-
-sub_repl([$&|Rep], M, Rest) -> M ++ sub_repl(Rep, M, Rest);
-sub_repl("\\&" ++ Rep, M, Rest) -> [$&|sub_repl(Rep, M, Rest)];
-sub_repl([C|Rep], M, Rest) -> [C|sub_repl(Rep, M, Rest)];
-sub_repl([], _M, Rest) -> Rest.
-
-%% gsub(String, RegExp, Replace) -> {ok,RepString,RepCount} | {error,E}.
-%% Substitute every match of the regular expression RegExp with the
-%% string New in String. Accept pre-parsed regular expressions.
-
-gsub(String, RegExp, Rep) when is_list(RegExp) ->
- case parse(RegExp) of
- {ok,RE} -> gsub(String, RE, Rep);
- {error,E} -> {error,E}
- end;
-gsub(String, {regexp,RE}, Rep) ->
- case gsub_re(String, 1, RE, [], Rep) of
- {NewStr,N} -> {ok,NewStr,N};
- no -> {ok,String,0} %No substitutions
- end;
-gsub(String, {comp_regexp,RE}, Rep) ->
- case gsub_comp(String, 1, RE, [], Rep) of
- {NewStr,N} -> {ok,NewStr,N};
- no -> {ok,String,0} %No substitutions
- end.
-
-%% gsub_re(String, Position, Regexp, Before, Replacement) ->
-%% {NewString,Count}.
-%% gsub_comp(String, Position, Regexp, Before, Replacement) ->
-%% {NewString,Count}.
-%% Step forward over String until a match is found saving stepped over
-%% chars in Before. Call recursively to do rest of string after
-%% match. Return reversed Before prepended to return from recursive
-%% call.
-
-gsub_re([C|Cs]=S0, P0, RE, Bef, Rep) ->
- case re_apply(S0, P0, RE) of
- {match,P0,_S1,_} -> %Ignore 0 length match
- gsub_re(Cs, P0+1, RE, [C|Bef], Rep);
- {match,P1,S1,_Gps} ->
- case gsub_re(S1, P1, RE, [], Rep) of
- {NewStr,N0} -> %Substituitions
- {reverse(Bef, sub_repl(Rep, substr(S0, 1, P1-P0), NewStr)),
- N0+1};
- no -> %No substituitions.
- {reverse(Bef, sub_repl(Rep, substr(S0, 1, P1-P0), S1)),1}
- end;
- %%No match so step forward saving C on Bef.
- nomatch -> gsub_re(Cs, P0+1, RE, [C|Bef], Rep);
- never_match -> no %No need to go on
- end;
-gsub_re([], _P, _RE, _Bef, _Rep) -> no.
-
-gsub_comp([C|Cs]=S0, P0, RE, Bef, Rep) ->
- case comp_apply(S0, P0, RE) of
- {match,P0,_S1} -> %Ignore 0 length match
- gsub_comp(Cs, P0+1, RE, [C|Bef], Rep);
- {match,P1,S1} ->
- case gsub_comp(S1, P1, RE, [], Rep) of
- {NewStr,N0} -> %Substituitions
- {reverse(Bef, sub_repl(Rep, substr(S0, 1, P1-P0), NewStr)),
- N0+1};
- no -> %No substituitions.
- {reverse(Bef, sub_repl(Rep, substr(S0, 1, P1-P0), S1)),1}
- end;
- %%No match so step forward saving C on Bef.
- nomatch -> gsub_comp(Cs, P0+1, RE, [C|Bef], Rep)
- end;
-gsub_comp([], _P, _RE, _Bef, _Rep) -> no.
-
-%% split(String, RegExp) -> {ok,[SubString]} | {error,E}.
-%% Split a string into substrings where the RegExp describes the
-%% field separator. The RegExp " " is specially treated.
-
-split(String, " ") -> %This is really special
- {ok,{regexp,RE}} = parse("[ \t]+"),
- case split_apply_re(String, RE, true) of
- [[]|Ss] -> {ok,Ss};
- Ss -> {ok,Ss}
- end;
-split(String, RegExp) when is_list(RegExp) ->
- case parse(RegExp) of
- {ok,{regexp,RE}} -> {ok,split_apply_re(String, RE, false)};
- {error,E} -> {error,E}
- end;
-split(String, {regexp,RE}) -> {ok,split_apply_re(String, RE, false)};
-split(String, {comp_regexp,RE}) -> {ok,split_apply_comp(String, RE, false)}.
-
-split_apply_re(S, RE, Trim) -> split_apply_re(S, 1, RE, Trim, []).
-
-split_apply_re([], _P, _RE, true, []) -> [];
-split_apply_re([], _P, _RE, _T, Sub) -> [reverse(Sub)];
-split_apply_re([C|Cs]=S, P0, RE, T, Sub) ->
- case re_apply(S, P0, RE) of
- {match,P0,_S1,_} -> %Ignore 0 length match
- split_apply_re(Cs, P0+1, RE, T, [C|Sub]);
- {match,P1,S1,_} ->
- [reverse(Sub)|split_apply_re(S1, P1, RE, T, [])];
- nomatch ->
- split_apply_re(Cs, P0+1, RE, T, [C|Sub]);
- never_match -> [reverse(Sub, S)] %No need to go on
- end.
-
-split_apply_comp(S, RE, Trim) -> split_apply_comp(S, 1, RE, Trim, []).
-
-%%split_apply_comp([], _P, _RE, true, []) -> [];
-split_apply_comp([], _P, _RE, _T, Sub) -> [reverse(Sub)];
-split_apply_comp([C|Cs]=S, P0, RE, T, Sub) ->
- case comp_apply(S, P0, RE) of
- {match,P0,_S1} -> %Ignore 0 length match
- split_apply_comp(Cs, P0+1, RE, T, [C|Sub]);
- {match,P1,S1} ->
- [reverse(Sub)|split_apply_comp(S1, P1, RE, T, [])];
- nomatch ->
- split_apply_comp(Cs, P0+1, RE, T, [C|Sub])
- end.
-
-%% sub_match(String, RegExp) ->
-%% {match,Start,Length,SubExprs} | nomatch | {error,E}.
-%% Find the longest match of RegExp in String.
-
-sub_match(S, RegExp) when is_list(RegExp) ->
- case parse(RegExp) of
- {ok,RE} -> sub_match(S, RE);
- {error,E} -> {error,E}
- end;
-sub_match(S, {regexp,RE}) ->
- case sub_match_re(RE, S, 1, 0, -1, none) of
- {Start,Len,Subs} when Len >= 0 ->
- {match,Start,Len,Subs};
- {_Start,_Len,_Subs} -> nomatch
- end.
-
-sub_match_re(RE, S0, Pos0, Mst, Mlen, Msubs) ->
- case first_match_re(RE, S0, Pos0) of
- {St,Len,Subs} -> %Found a match
- Pos1 = St + 1, %Where to start next match
- S1 = lists:nthtail(Pos1-Pos0, S0),
- if Len > Mlen -> sub_match_re(RE, S1, Pos1, St, Len, Subs);
- true -> sub_match_re(RE, S1, Pos1, Mst, Mlen, Msubs)
- end;
- nomatch -> {Mst,Mlen,Msubs}
- end.
-
-%% sub_first_match(String, RegExp) ->
-%% {match,Start,Length,SubExprs} | nomatch | {error,E}.
-%% Find the longest match of RegExp in String, return Start and Length
-%% as well as tuple of sub-expression matches.
-
-sub_first_match(S, RegExp) when is_list(RegExp) ->
- {ok,RE} = parse(RegExp),
- sub_first_match(S, RE);
-sub_first_match(S, {regexp,RE}) ->
- case first_match_re(RE, S, 1) of
- {St,Len,Subs} -> {match,St,Len,Subs};
- nomatch -> nomatch
- end.
-
-
-%% This is the regular expression grammar used. It is equivalent to the
-%% one used in AWK, except that we allow ^ $ to be used anywhere and fail
-%% in the matching.
-%%
-%% reg -> reg1 : '$1'.
-%% reg1 -> reg1 "|" reg2 : {'or','$1','$2'}.
-%% reg1 -> reg2 : '$1'.
-%% reg2 -> reg2 reg3 : {concat,'$1','$2'}.
-%% reg2 -> reg3 : '$1'.
-%% reg3 -> reg3 "*" : {kclosure,'$1'}.
-%% reg3 -> reg3 "+" : {pclosure,'$1'}.
-%% reg3 -> reg3 "?" : {optional,'$1'}.
-%% reg3 -> reg3 "{" [Min],[Max] "}" : {closure_range, Num, '$1'} see below
-%% reg3 -> reg4 : '$1'.
-%% reg4 -> "(" reg ")" : '$2'.
-%% reg4 -> "\\" char : '$2'.
-%% reg4 -> "^" : bos.
-%% reg4 -> "$" : eos.
-%% reg4 -> "." : char.
-%% reg4 -> "[" class "]" : {char_class,char_class('$2')}
-%% reg4 -> "[" "^" class "]" : {comp_class,char_class('$3')}
-%% reg4 -> "\"" chars "\"" : char_string('$2')
-%% reg4 -> char : '$1'.
-%% reg4 -> empty : epsilon.
-%% The grammar of the current regular expressions. The actual parser
-%% is a recursive descent implementation of the grammar.
-
-reg(S, Sc) -> reg1(S, Sc).
-
-%% reg1 -> reg2 reg1'
-%% reg1' -> "|" reg2
-%% reg1' -> empty
-
-reg1(S0, Sc0) ->
- {L,Sc1,S1} = reg2(S0, Sc0),
- reg1p(S1, L, Sc1).
-
-reg1p([$||S0], L, Sc0) ->
- {R,Sc1,S1} = reg2(S0, Sc0),
- reg1p(S1, {'or',L,R}, Sc1);
-reg1p(S, L, Sc) -> {L,Sc,S}.
-
-%% reg2 -> reg3 reg2'
-%% reg2' -> reg3
-%% reg2' -> empty
-
-reg2(S0, Sc0) ->
- {L,Sc1,S1} = reg3(S0, Sc0),
- reg2p(S1, L, Sc1).
-
-reg2p([C|S0], L, Sc0) when C /= $|, C /= $) ->
- {R,Sc1,S1} = reg3([C|S0], Sc0),
- %% reg2p(S1, {concat,L,R}, Sc1);
- case is_integer(R) of
- true ->
- case L of
- {literal,Lit} ->
- reg2p(S1, {literal,Lit ++[R]}, Sc1);
- {concat,S2,Char} when is_integer(Char) ->
- reg2p(S1, {concat,S2,{literal,[Char,R]}}, Sc1);
- {concat,S2,{literal,Lit}} ->
- reg2p(S1, {concat,S2,{literal,Lit ++ [R]}}, Sc1);
- Char when is_integer(Char) ->
- reg2p(S1, {literal,[Char,R]}, Sc1);
- _ ->
- reg2p(S1, {concat,L,R}, Sc1)
- end;
- false ->
- reg2p(S1, {concat,L,R}, Sc1)
- end;
-reg2p(S, L, Sc) -> {L,Sc,S}.
-
-%% reg3 -> reg4 reg3'
-%% reg3' -> "*" reg3'
-%% reg3' -> "+" reg3'
-%% reg3' -> "?" reg3'
-%% reg3' -> "{" [Min],[Max] "}" reg3'
-%% reg3' -> empty
-
-reg3(S0, Sc0) ->
- {L,Sc1,S1} = reg4(S0, Sc0),
- reg3p(S1, L, Sc1).
-
-reg3p([$*|S], L, Sc) -> reg3p(S, {kclosure,L}, Sc);
-reg3p([$+|S], L, Sc) -> reg3p(S, {pclosure,L}, Sc);
-reg3p([$?|S], L, Sc) -> reg3p(S, {optional,L}, Sc);
-reg3p([${|Cs0], L, Sc) -> % $}
- case interval_range(Cs0) of
- {none,none,_Cs1} -> parse_error({interval_range,[${|Cs0]});
- {N,M,[$}|Cs1]} -> reg3p(Cs1, {iclosure,L,N,M}, Sc);
- {_N,_M,_Cs1} -> parse_error({unterminated,"{"})
- end;
-reg3p(S, L, Sc) -> {L,Sc,S}.
-
-reg4([$(|S0], Sc0) ->
- Sc1 = Sc0+1,
- case reg(S0, Sc1) of
- {R,Sc2,[$)|S1]} -> {{subexpr,Sc1,R},Sc2,S1};
- {_R,_Sc,_S} -> parse_error({unterminated,"("})
- end;
-reg4([$^|S], Sc) -> {bos,Sc,S};
-reg4([$$|S], Sc) -> {eos,Sc,S};
-reg4([$.|S], Sc) -> {{comp_class,"\n"},Sc,S};
-reg4("[^" ++ S0, Sc) ->
- case char_class(S0) of
- {Cc,[$]|S1]} -> {{comp_class,Cc},Sc,S1};
- {_Cc,_S} -> parse_error({unterminated,"["})
- end;
-reg4([$[|S0], Sc) ->
- case char_class(S0) of
- {Cc,[$]|S1]} -> {{char_class,Cc},Sc,S1};
- {_Cc,_S1} -> parse_error({unterminated,"["})
- end;
-%reg4([$"|S0], Sc) ->
-% case char_string(S0) of
-% {St,[$"|S1]} -> {St,Sc,S1};
-% {St,S1} -> parse_error({unterminated,"\""})
-% end;
-reg4([C0|S0], Sc) when
- is_integer(C0), C0 /= $*, C0 /= $+, C0 /= $?, C0 /= $], C0 /= $), C0 /= $} ->
- %% Handle \ quoted characters as well, at least those we see.
- {C1,S1} = char(C0, S0),
- {C1,Sc,S1};
-reg4(S=[$)|_], Sc) -> {epsilon,Sc,S};
-reg4([C|_S], _Sc) -> parse_error({illegal,[C]});
-reg4([], Sc) -> {epsilon,Sc,[]}.
-
-char($\\, [O1,O2,O3|S]) when
- O1 >= $0, O1 =< $7, O2 >= $0, O2 =< $7, O3 >= $0, O3 =< $7 ->
- {(O1*8 + O2)*8 + O3 - 73*$0,S};
-char($\\, [C|S]) -> {escape_char(C),S};
-char($\\, []) -> parse_error({unterminated,"\\"});
-char(C, S) -> {C,S}.
-
-escape_char($n) -> $\n; %\n = LF
-escape_char($r) -> $\r; %\r = CR
-escape_char($t) -> $\t; %\t = TAB
-escape_char($v) -> $\v; %\v = VT
-escape_char($b) -> $\b; %\b = BS
-escape_char($f) -> $\f; %\f = FF
-escape_char($e) -> $\e; %\e = ESC
-escape_char($s) -> $\s; %\s = SPACE
-escape_char($d) -> $\d; %\d = DEL
-escape_char(C) -> C.
-
-char_class([$]|S0]) ->
- {Cc,S1} = char_class(S0, [$]]),
- {pack_cc(Cc),S1};
-char_class(S0) ->
- {Cc,S1} = char_class(S0, []),
- {pack_cc(Cc),S1}.
-
-pack_cc(Cc0) ->
- %% First sort the list.
- Cc1 = lists:usort(fun ({Cf1,_}, {Cf2,_}) -> Cf1 < Cf2;
- ({Cf1,_}, C) -> Cf1 < C;
- (C, {Cf,_}) -> C < Cf;
- (C1, C2) -> C1 =< C2
- end, Cc0),
- pack_cc1(Cc1).
-
-pack_cc1([{Cf1,Cl1},{Cf2,Cl2}|Cc]) when Cl1 >= Cf2, Cl1 =< Cl2 ->
- pack_cc1([{Cf1,Cl2}|Cc]);
-pack_cc1([{Cf1,Cl1},{Cf2,Cl2}|Cc]) when Cl1 >= Cf2, Cl1 >= Cl2 ->
- pack_cc1([{Cf1,Cl1}|Cc]);
-pack_cc1([{Cf1,Cl1},{Cf2,Cl2}|Cc]) when Cl1+1 == Cf2 ->
- pack_cc1([{Cf1,Cl2}|Cc]);
-pack_cc1([{Cf,Cl},C|Cc]) when Cl >= C -> pack_cc1([{Cf,Cl}|Cc]);
-pack_cc1([{Cf,Cl},C|Cc]) when Cl+1 == C -> pack_cc1([{Cf,C}|Cc]);
-pack_cc1([C,{Cf,Cl}|Cc]) when C == Cf-1 -> pack_cc1([{C,Cl}|Cc]);
-pack_cc1([C1,C2|Cc]) when C1+1 == C2 -> pack_cc1([{C1,C2}|Cc]);
-pack_cc1([C|Cc]) -> [C|pack_cc1(Cc)];
-pack_cc1([]) -> [].
-
-char_class("[:" ++ S0, Cc0) -> %Start of POSIX char class
- case posix_cc(S0, Cc0) of
- {Cc1,":]" ++ S1} -> char_class(S1, Cc1);
- {_,_S1} -> parse_error({posix_cc,"[:" ++ S0})
- end;
-char_class([C1|S0], Cc) when C1 /= $] ->
- case char(C1, S0) of
- {Cf,[$-,C2|S1]} when C2 /= $] ->
- case char(C2, S1) of
- {Cl,S2} when Cf < Cl -> char_class(S2, [{Cf,Cl}|Cc]);
- {_Cl,_S2} -> parse_error({char_class,[C1|S0]})
- end;
- {C,S1} -> char_class(S1, [C|Cc])
- end;
-char_class(S, Cc) -> {Cc,S}.
-
-%% posix_cc(String, CharClass) -> {NewCharClass,RestString}.
-%% Handle POSIX character classes, use Latin-1 character set.
-
-posix_cc("alnum" ++ S, Cc) ->
- {[{$0,$9},{$A,$Z},{192,214},{216,223},{$a,$z},{224,246},{248,255}|Cc],S};
-posix_cc("alpha" ++ S, Cc) ->
- {[{$A,$Z},{192,214},{216,223},{$a,$z},{224,246},{248,255}|Cc],S};
-posix_cc("blank" ++ S, Cc) -> {[$\s,$\t,160|Cc],S};
-posix_cc("cntrl" ++ S, Cc) -> {[{0,31},{127,159}|Cc],S};
-posix_cc("digit" ++ S, Cc) -> {[{$0,$9}|Cc],S};
-posix_cc("graph" ++ S, Cc) -> {[{33,126},{161,255}|Cc],S};
-posix_cc("lower" ++ S, Cc) -> {[{$a,$z},{224,246},{248,255}|Cc],S};
-posix_cc("print" ++ S, Cc) -> {[{32,126},{160,255}|Cc],S};
-posix_cc("punct" ++ S, Cc) -> {[{$!,$/},{$:,$?},{${,$~},{161,191}|Cc],S};
-posix_cc("space" ++ S, Cc) -> {[$\s,$\t,$\f,$\r,$\v,160|Cc],S};
-posix_cc("upper" ++ S, Cc) -> {[{$A,$Z},{192,214},{216,223}|Cc],S};
-posix_cc("xdigit" ++ S, Cc) -> {[{$a,$f},{$A,$F},{$0,$9}|Cc],S};
-posix_cc(S, _Cc) -> parse_error({posix_cc,"[:" ++ S}).
-
-interval_range(Cs0) ->
- case number(Cs0) of
- {none,Cs1} -> {none,none,Cs1};
- {N,[$,|Cs1]} ->
- case number(Cs1) of
- {none,Cs2} -> {N,any,Cs2};
- {M,Cs2} -> {N,M,Cs2}
- end;
- {N,Cs1} -> {N,none,Cs1}
- end.
-
-number([C|Cs]) when C >= $0, C =< $9 ->
- number(Cs, C - $0);
-number(Cs) -> {none,Cs}.
-
-number([C|Cs], Acc) when C >= $0, C =< $9 ->
- number(Cs, 10*Acc + (C - $0));
-number(Cs, Acc) -> {Acc,Cs}.
-
-parse_error(E) -> throw({error,E}).
-
-%char_string([C|S]) when C /= $" -> char_string(S, C);
-%char_string(S) -> {epsilon,S}.
-
-%char_string([C|S0], L) when C /= $" ->
-% char_string(S0, {concat,L,C});
-%char_string(S, L) -> {L,S}.
-
-%% re_apply(String, StartPos, RegExp) ->
-%% {match,RestPos,Rest,SubExprs} | nomatch.
-%%
-%% Apply the (parse of the) regular expression RegExp to String. If
-%% there is a match return the position of the remaining string and
-%% the string if else return 'nomatch'.
-%%
-%% StartPos should be the real start position as it is used to decide
-%% if we are at the beginning of the string.
-
-re_apply(S, St, {RE,Sc}) ->
- Subs = erlang:make_tuple(Sc, none), %Make a sub-regexp table.
- Res = re_apply(RE, [], S, St, Subs),
- %% ?dbg("~p x ~p -> ~p\n", [RE,S,Res]),
- Res.
-
-re_apply(epsilon, More, S, P, Subs) -> %This always matches
- re_apply_more(More, S, P, Subs);
-re_apply({'or',RE1,RE2}, More, S, P, Subs) ->
- re_apply_or(re_apply(RE1, More, S, P, Subs),
- re_apply(RE2, More, S, P, Subs));
-re_apply({concat,RE1,RE2}, More, S0, P, Subs) ->
- re_apply(RE1, [RE2|More], S0, P, Subs);
-re_apply({literal,[C|Lcs]}, More, [C|S], P, Subs) ->
- re_apply_lit(Lcs, More, S, P+1, Subs); %Have matched first char
-re_apply({kclosure,RE}, More, S0, P0, Subs0) ->
- %% Greedy so try RE first, no difference here actually.
- Loop = case re_apply(RE, [], S0, P0, Subs0) of
- {match,P0,_S1,_Subs1} -> %0 length match, don't loop!
- nomatch;
- {match,P1,S1,Subs1} ->
- re_apply_more([{kclosure,RE}|More], S1, P1, Subs1);
- nomatch -> nomatch;
- never_match -> never_match
- end,
- re_apply_or(Loop, re_apply_more(More, S0, P0, Subs0));
-re_apply({pclosure,RE}, More, S, P, Subs) ->
- re_apply(RE, [{kclosure,RE}|More], S, P, Subs);
-re_apply({optional,RE}, More, S, P, Subs) ->
- %% Greedy so try RE first, no difference here actually.
- re_apply_or(re_apply(RE, More, S, P, Subs),
- re_apply_more(More, S, P, Subs));
-re_apply({iclosure,RE,N,M}, More, S, P, Subs) when N > 0 ->
- re_apply(RE, [{iclosure,RE,N-1,M}|More], S, P, Subs);
-re_apply({iclosure,RE,0,M}, More, S, P, Subs) ->
- Exp = expand_opt(RE, M),
- re_apply(Exp, More, S, P, Subs);
-re_apply({subexpr,N,RE}, More, S, P, Subs) ->
- re_apply(RE, [{endsub,N,P}|More], S, P, Subs);
-re_apply({endsub,N,St}, More, S, P, Subs0) ->
- Subs1 = setelement(N, Subs0, {St,P-St}), %Record sub-expr
- re_apply_more(More, S, P, Subs1);
-re_apply(bos, More, S, 1, Subs) -> re_apply_more(More, S, 1, Subs);
-re_apply(bos, _More, _S, _, _) -> never_match;
-re_apply(eos, More, [$\n], P, Subs) -> re_apply_more(More, [], P, Subs);
-re_apply(eos, More, [], P, Subs) -> re_apply_more(More, [], P, Subs);
-re_apply({char_class,Cc}, More, [C|S], P, Subs) ->
- case in_char_class(C, Cc) of
- true -> re_apply_more(More, S, P+1, Subs);
- false -> nomatch
- end;
-re_apply({comp_class,Cc}, More, [C|S], P, Subs) ->
- case in_char_class(C, Cc) of
- true -> nomatch;
- false -> re_apply_more(More, S, P+1, Subs)
- end;
-re_apply(C, More, [C|S], P, Subs) when is_integer(C) ->
- re_apply_more(More, S, P+1, Subs);
-re_apply(_RE, _More, _S, _P, _Subs) ->
- %% ?dbg("~p : ~p\n", [_RE,_S]),
- nomatch.
-
-%% re_apply_more([RegExp], String, Length, SubsExprs) ->
-%% {match,RestPos,Rest,SubExprs} | nomatch.
-
-re_apply_more([RE|More], S, P, Subs) -> re_apply(RE, More, S, P, Subs);
-re_apply_more([], S, P, Subs) -> {match,P,S,Subs}.
-
-%% re_apply_lit(Literal, More, String, Position, SubExprs) ->
-%% {match,RestPos,Rest,SubExprs} | nomatch.
-re_apply_lit([C|Lit], More, [C|Cs], P, Subs) ->
- re_apply_lit(Lit, More, Cs, P+1, Subs);
-re_apply_lit([], More, Cs, P, Subs) ->
- re_apply_more(More, Cs, P, Subs);
-re_apply_lit(_Lit, _More, _Cs, _P, _Subs) ->
- nomatch.
-
-%% expand_iclosure(RE, N, M) -> RE.
-
-expand_iclosure(RE, 0, M) -> expand_opt(RE, M);
-expand_iclosure(RE, N, M) ->
- {concat,RE,expand_iclosure(RE, N-1, M)}.
-
-%% expand_opt(RegExp, Count) -> RE.
-%% Handle all the cases.
-
-expand_opt(_RE, none) -> epsilon;
-expand_opt(RE, any) -> {kclosure,RE};
-expand_opt(_RE, 0) -> epsilon;
-expand_opt(RE, 1) -> {optional,RE};
-expand_opt(RE, N) ->
- {optional,{concat,RE,expand_opt(RE, N-1)}}.
-
-%% find_prefix(PrefixStr, SourceStr)
-%% if PrefixStr is a prefix of Str then return {ok,RemainingStr}
-%% otherwise return false
-
-%% find_prefix([C|Prest], [C|Rest]) ->
-%% find_prefix(Prest, Rest);
-%% find_prefix([], Rest) -> {yes,Rest};
-%% find_prefix(_, _) -> no.
-
-%% in_char_class(Char, Class) -> bool().
-
-in_char_class(C, [{C1,C2}|_Cc]) when C >= C1, C =< C2 -> true;
-in_char_class(C, [C|_Cc]) -> true;
-in_char_class(C, [_|Cc]) -> in_char_class(C, Cc);
-in_char_class(_C, []) -> false.
-
-%% re_apply_or(Match1, Match2, SubExprs) ->
-%% {match,RestPos,Rest,SubExprs} | nomatch.
-%% If we want the best match then choose the longest match, else just
-%% choose one by trying sequentially.
-
-re_apply_or(M1={match,P1,_,_},{match,P2,_,_}) when P1 >= P2 -> M1;
-re_apply_or({match,_,_,_}, M2={match,_,_,_}) -> M2;
-re_apply_or(never_match, R2) -> R2;
-re_apply_or(R1, never_match) -> R1;
-re_apply_or(nomatch, R2) -> R2;
-re_apply_or(R1, nomatch) -> R1.
-
-%% Record definitions for the NFA, DFA and compiler.
-
--record(nfa_state, {no,edges=[],accept=no}).
--record(dfa_state, {no,nfa=[],trans=[],accept=no}).
-
--record(c_state, {no,trans=[],tmin=0,smin=none,tmax=0,smax=none,
- accept=false,spec=[]}).
-
-%% We use standard methods, Thompson's construction and subset
-%% construction, to create first an NFA and then a DFA from the
-%% regexps. A non-standard feature is that we work with sets of
-%% character ranges (crs) instead sets of characters. This is most
-%% noticeable when constructing DFAs. The major benefit is that we can
-%% handle characters from any set, not just limited ASCII or 8859,
-%% even 16/32 bit unicode.
-%%
-%% The whole range of characters is 0-maxchar, where maxchar is a BIG
-%% number. We don't make any assumptions about the size of maxchar, it
-%% is just bigger than any character.
-%%
-%% Using character ranges makes describing many regexps very simple,
-%% for example the regexp "." just becomes the range
-%% [{0-9},{11-maxchar}].
-
-%% make_nfa(RegExpActions) -> {ok,{NFA,StartState}} | {error,E}.
-%% Build a complete nfa from a list of {RegExp,Action}. The NFA field
-%% accept has values {yes,Action}|no. The NFA is a list of states.
-
-make_nfa(REAs0) ->
- case parse_reas(REAs0) of
- {ok,REAs1} ->
- {NFA,Start} = build_combined_nfa(REAs1),
- {ok,{NFA,Start}};
- {error,E} -> {error,E}
- end.
-
-%% make_dfa(RegExpActions) -> {ok,{DFA,StartState}} | {error,E}.
-%% make_dfa(RegExpActions, LowestState) -> {ok,{DFA,StartState}} | {error,E}.
-%% Build a complete dfa from a list of {RegExp,Action}. The DFA field
-%% accept has values {yes,Action}|no. If multiple Regexps can result
-%% in same match string then RegExpActions list define priority.
-
-make_dfa(REAs) -> make_dfa(REAs, 0).
-
-make_dfa(REAs0, Low) ->
- case parse_reas(REAs0) of
- {ok,REAs1} ->
- {NFA,Start0} = build_combined_nfa(REAs1),
- {DFA0,Start1} = build_dfa(NFA, Start0),
- {DFA,Start} = minimise_dfa(DFA0, Start1, Low),
- {ok,{DFA,Start}};
- {error,E} -> {error,E}
- end.
-
-parse_reas(REAs) -> parse_reas(REAs, []).
-
-parse_reas([{{regexp,{R,_Sc}},A}|REAs], S) -> %Already parsed
- parse_reas(REAs, [{R,A}|S]);
-parse_reas([{RegExp,A}|REAs], S) ->
- case parse(RegExp) of
- {ok,{regexp,{R,_Sc}}} -> parse_reas(REAs, [{R,A}|S]);
- {error,E} -> {error,E}
- end;
-parse_reas([], Stack) -> {ok,reverse(Stack)}.
-
-%% build_combined_nfa(RegExpActionList) -> {NFA,StartState}.
-%% Build the combined NFA using Thompson's construction straight out
-%% of the book. Build the separate NFAs in the same order as the
-%% rules so that the accepting have ascending states have ascending
-%% state numbers. Start numbering the states from 1 as we put the
-%% states in a tuple with the state number as the index.
-
-build_combined_nfa(REAs) ->
- {NFA,Starts,Next} = build_nfa_list(REAs, [], [], 1),
- F = #nfa_state{no=Next,edges=epsilon_trans(Starts),accept=no},
- {[F|NFA],Next}.
-
-build_nfa_list([{RE,Action}|REAs], NFA0, Starts, Next0) ->
- {NFA1,Next1,Start} = build_nfa(RE, Next0, Action),
- build_nfa_list(REAs, NFA1 ++ NFA0, [Start|Starts], Next1);
-build_nfa_list([], NFA, Starts, Next) ->
- {NFA,reverse(Starts),Next}.
-
-epsilon_trans(Sts) -> [ {epsilon,S} || S <- Sts ].
-
-%% build_nfa(RegExp, NextState, Action) -> {NFA,NextFreeState,StartState}.
-%% When building the NFA states for a ??? we don't build the end
-%% state, just allocate a State for it and return this state
-%% number. This allows us to avoid building unnecessary states for
-%% concatenation which would then have to be removed by overwriting
-%% an existing state.
-
-build_nfa(RE, Next, Action) ->
- {NFA,N,E} = build_nfa(RE, Next+1, Next, []),
- {[#nfa_state{no=E,accept={yes,Action}}|NFA],N,Next}.
-
-%% build_nfa(RegExp, NextState, StartState, NFA) -> {NFA,NextState,EndState}.
-%% The NFA is a list of nfa_state is no predefined order. The state
-%% number of the returned EndState is already allocated!
-
-build_nfa({'or',RE1,RE2}, N0, S, NFA0) ->
- {NFA1,N1,E1} = build_nfa(RE1, N0+1, N0, NFA0),
- {NFA2,N2,E2} = build_nfa(RE2, N1+1, N1, NFA1),
- E = N2,
- {[#nfa_state{no=S,edges=[{epsilon,N0},{epsilon,N1}]},
- #nfa_state{no=E1,edges=[{epsilon,E}]},
- #nfa_state{no=E2,edges=[{epsilon,E}]}|NFA2],
- N2+1,E};
-build_nfa({literal,[]}, N, S, NFA) ->
- {NFA,N,S};
-build_nfa({literal,[C|Cs]}, N0, S, NFA0) ->
- {NFA1,N1,E1} = build_nfa(C, N0, S, NFA0),
- build_nfa({literal,Cs}, N1, E1, NFA1);
-build_nfa({concat,RE1,RE2}, N0, S, NFA0) ->
- {NFA1,N1,E1} = build_nfa(RE1, N0, S, NFA0),
- {NFA2,N2,E2} = build_nfa(RE2, N1, E1, NFA1),
- {NFA2,N2,E2};
-build_nfa({kclosure,RE}, N0, S, NFA0) ->
- {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0),
- E = N1,
- {[#nfa_state{no=S,edges=[{epsilon,N0},{epsilon,E}]},
- #nfa_state{no=E1,edges=[{epsilon,N0},{epsilon,E}]}|NFA1],
- N1+1,E};
-build_nfa({pclosure,RE}, N0, S, NFA0) ->
- {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0),
- E = N1,
- {[#nfa_state{no=S,edges=[{epsilon,N0}]},
- #nfa_state{no=E1,edges=[{epsilon,N0},{epsilon,E}]}|NFA1],
- N1+1,E};
-build_nfa({optional,RE}, N0, S, NFA0) ->
- {NFA1,N1,E1} = build_nfa(RE, N0+1, N0, NFA0),
- E = N1,
- {[#nfa_state{no=S,edges=[{epsilon,N0},{epsilon,E}]},
- #nfa_state{no=E1,edges=[{epsilon,E}]}|NFA1],
- N1+1,E};
-build_nfa({iclosure,RE,I1,I2}, N, S, NFA) ->
- Exp = expand_iclosure(RE, I1, I2),
- build_nfa(Exp, N, S, NFA);
-build_nfa({char_class,Cc}, N, S, NFA) ->
- {[#nfa_state{no=S,edges=[{nfa_char_class(Cc),N}]}|NFA],N+1,N};
-build_nfa({comp_class,Cc}, N, S, NFA) ->
- {[#nfa_state{no=S,edges=[{nfa_comp_class(Cc),N}]}|NFA],N+1,N};
-build_nfa(epsilon, N, S, NFA) ->
- {NFA,N,S};
-build_nfa({group,RE}, N, S, NFA) -> %%% FIXME %%%%%%%
- build_nfa(RE, N, S, NFA);
-build_nfa({subexpr,_N,RE}, N, S, NFA) -> %%% FIXME %%%%%%%
- build_nfa(RE, N, S, NFA);
-build_nfa(bos, N, S, NFA) ->
- {[#nfa_state{no=S,edges=[{[bos],N}]}|NFA],N+1,N};
-build_nfa(eos, N, S, NFA) ->
- {[#nfa_state{no=S,edges=[{[eos],N}]}|NFA],N+1,N};
-%%{[#nfa_state{no=S,edges=[{[eos],N}]}|NFA],N+1,N};
-build_nfa(C, N, S, NFA) when is_integer(C) ->
- {[#nfa_state{no=S,edges=[{[{C,C}],N}]}|NFA],N+1,N}.
-
-nfa_char_class(Cc) ->
- Crs = lists:foldl(fun({C1,C2}, Set) -> add_element({C1,C2}, Set);
- (C, Set) -> add_element({C,C}, Set) end, [], Cc),
- %% ?dbg("cc: ~p\n", [Crs]),
- pack_crs(Crs).
-
-pack_crs([{C1,C2}=Cr,{C3,C4}|Crs]) when C1 =< C3, C2 >= C4 ->
- %% C1 C2
- %% C3 C4
- pack_crs([Cr|Crs]);
-pack_crs([{C1,C2},{C3,C4}|Crs]) when C2 >= C3, C2 < C4 ->
- %% C1 C2
- %% C3 C4
- pack_crs([{C1,C4}|Crs]);
-pack_crs([{C1,C2},{C3,C4}|Crs]) when C2 + 1 == C3 ->
- %% C1 C2
- %% C3 C4
- pack_crs([{C1,C4}|Crs]);
-pack_crs([Cr|Crs]) -> [Cr|pack_crs(Crs)];
-pack_crs([]) -> [].
-
-nfa_comp_class(Cc) ->
- Crs = nfa_char_class(Cc),
- %% ?dbg("comp: ~p\n", [Crs]),
- comp_crs(Crs, 0).
-
-comp_crs([{C1,C2}|Crs], Last) ->
- [{Last,C1-1}|comp_crs(Crs, C2+1)];
-comp_crs([], Last) -> [{Last,maxchar}].
-
-%% build_dfa(NFA, NfaStartState) -> {DFA,DfaStartState}.
-%% Build a DFA from an NFA using "subset construction". The major
-%% difference from the book is that we keep the marked and unmarked
-%% DFA states in separate lists. New DFA states are added to the
-%% unmarked list and states are marked by moving them to the marked
-%% list. We assume that the NFA accepting state numbers are in
-%% ascending order for the rules and use ordsets to keep this order.
-
-build_dfa(NFA0, Start) ->
- %% We want NFA as sorted tuple for fast access, assume lowest state 1.
- NFA1 = list_to_tuple(keysort(#nfa_state.no, NFA0)),
- D = #dfa_state{no=0,nfa=eclosure([Start], NFA1),accept=no},
- {build_dfa([D], 1, [], NFA1),0}.
-
-%% build_dfa([UnMarked], NextState, [Marked], NFA) -> DFA.
-%% Traverse the unmarked states. Temporarily add the current unmarked
-%% state to the marked list before calculating translation, this is
-%% to avoid adding too many duplicate states. Add it properly to the
-%% marked list afterwards with correct translations.
-
-build_dfa([U|Us0], N0, Ms, NFA) ->
- {Ts,Us1,N1} = build_dfa(U#dfa_state.nfa, Us0, N0, [], [U|Ms], NFA),
- M = U#dfa_state{trans=Ts,accept=accept(U#dfa_state.nfa, NFA)},
- build_dfa(Us1, N1, [M|Ms], NFA);
-build_dfa([], _N, Ms, _NFA) -> Ms.
-
-%% build_dfa([NfaState], [Unmarked], NextState, [Transition], [Marked], NFA) ->
-%% {Transitions,UnmarkedStates,NextState}.
-%% Foreach NFA state set calculate the legal translations. N.B. must
-%% search *BOTH* the unmarked and marked lists to check if DFA state
-%% already exists. As the range of characters is potentially VERY
-%% large we cannot explicitly test all characters. Instead we first
-%% calculate the set of all disjoint character ranges which are
-%% possible candidates to the set of NFA states.
-
-build_dfa(Set, Us, N, Ts, Ms, NFA) ->
- %% List of all transition sets.
- Crs0 = [Cr || S <- Set,
- {Crs,_St} <- (element(S, NFA))#nfa_state.edges,
- is_list(Crs),
- Cr <- Crs ],
- Crs1 = lists:usort(Crs0), %Must remove duplicates!
- %% Build list of disjoint test ranges.
- Test = disjoint_crs(Crs1),
- %% ?dbg("bd: ~p\n ~p\n ~p\n ~p\n", [Set,Crs0,Crs1,Test]),
- build_dfa(Test, Set, Us, N, Ts, Ms, NFA).
-
-%% disjoint_crs([CharRange]) -> [CharRange].
-%% Take a sorted list of char ranges and make a sorted list of
-%% disjoint char ranges. No new char range extends past an existing
-%% char range.
-
-disjoint_crs([{_C1,C2}=Cr1,{C3,_C4}=Cr2|Crs]) when C2 < C3 ->
- %% C1 C2
- %% C3 C4
- [Cr1|disjoint_crs([Cr2|Crs])];
-disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 == C3 ->
- %% C1 C2
- %% C3 C4
- [{C1,C2}|disjoint_crs(add_element({C2+1,C4}, Crs))];
-disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 < C3, C2 >= C3, C2 < C4 ->
- %% C1 C2
- %% C3 C4
- [{C1,C3-1}|disjoint_crs(union([{C3,C2},{C2+1,C4}], Crs))];
-disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 < C3, C2 == C4 ->
- %% C1 C2
- %% C3 C4
- [{C1,C3-1}|disjoint_crs(add_element({C3,C4}, Crs))];
-disjoint_crs([{C1,C2},{C3,C4}|Crs]) when C1 < C3, C2 > C4 ->
- %% C1 C2
- %% C3 C4
- [{C1,C3-1}|disjoint_crs(union([{C3,C4},{C4+1,C2}], Crs))];
-disjoint_crs([Cr|Crs]) -> [Cr|disjoint_crs(Crs)];
-disjoint_crs([]) -> [].
-
-build_dfa([Cr|Crs], Set, Us, N, Ts, Ms, NFA) ->
- case eclosure(move(Set, Cr, NFA), NFA) of
- S when S /= [] ->
- case keysearch(S, #dfa_state.nfa, Us) of
- {value,#dfa_state{no=T}} ->
- build_dfa(Crs, Set, Us, N, [{Cr,T}|Ts], Ms, NFA);
- false ->
- case keysearch(S, #dfa_state.nfa, Ms) of
- {value,#dfa_state{no=T}} ->
- build_dfa(Crs, Set, Us, N, [{Cr,T}|Ts], Ms, NFA);
- false ->
- U = #dfa_state{no=N,nfa=S},
- build_dfa(Crs, Set, [U|Us], N+1, [{Cr,N}|Ts], Ms, NFA)
- end
- end;
- [] ->
- build_dfa(Crs, Set, Us, N, Ts, Ms, NFA)
- end;
-build_dfa([], _Set, Us, N, Ts, _Ms, _NFA) ->
- {Ts,Us,N}.
-
-%% eclosure([State], NFA) -> [State].
-%% move([State], Char, NFA) -> [State].
-%% These are straight out of the book. As eclosure uses ordsets then
-%% the generated state sets are in ascending order.
-
-eclosure(Sts, NFA) -> eclosure(Sts, NFA, []).
-
-eclosure([St|Sts], NFA, Ec) ->
- #nfa_state{edges=Es} = element(St, NFA),
- eclosure([ N || {epsilon,N} <- Es,
- not is_element(N, Ec) ] ++ Sts,
- NFA, add_element(St, Ec));
-eclosure([], _NFA, Ec) -> Ec.
-
-move(Sts, Cr, NFA) ->
- [ St || N <- Sts,
- {Crs,St} <- (element(N, NFA))#nfa_state.edges,
- is_list(Crs),
-%% begin
-%% ?dbg("move1: ~p\n", [{Sts,Cr,Crs,in_crs(Cr,Crs)}]),
-%% true
-%% end,
- in_crs(Cr, Crs) ].
-
-in_crs({C1,C2}, [{C3,C4}|_Crs]) when C1 >= C3, C2 =< C4 -> true;
-in_crs(Cr, [Cr|_Crs]) -> true; %Catch bos and eos.
-in_crs(Cr, [_|Crs]) -> in_crs(Cr, Crs);
-in_crs(_Cr, []) -> false.
-
-%% accept([State], NFA) -> true | false.
-%% Scan down the state list until we find an accepting state.
-
-accept([St|Sts], NFA) ->
- case element(St, NFA) of
- #nfa_state{accept={yes,A}} -> {yes,A};
- #nfa_state{accept=no} -> accept(Sts, NFA)
- end;
-accept([], _NFA) -> no.
-
-%% minimise_dfa(DFA, StartState, FirstState) -> {DFA,StartState}.
-%% Minimise the DFA by removing equivalent states. We consider a
-%% state if both the transitions and the their accept state is the
-%% same. First repeatedly run through the DFA state list removing
-%% equivalent states and updating remaining transitions with
-%% remaining equivalent state numbers. When no more reductions are
-%% possible then pack the remaining state numbers to get consecutive
-%% states.
-
-minimise_dfa(DFA0, Start, N) ->
- case min_dfa(DFA0) of
- {DFA1,[]} -> %No reduction!
- {DFA2,Rs} = pack_dfa(DFA1, N),
- {min_update(DFA2, Rs),min_new_state(Start, Rs)};
- {DFA1,Rs} ->
- minimise_dfa(min_update(DFA1, Rs), min_new_state(Start, Rs), N)
- end.
-
-min_dfa(DFA) -> min_dfa(DFA, [], []).
-
-min_dfa([D|DFA0], Rs0, MDFA) ->
- {DFA1,Rs1} = min_delete(DFA0, D#dfa_state.trans, D#dfa_state.accept,
- D#dfa_state.no, Rs0, []),
- min_dfa(DFA1, Rs1, [D|MDFA]);
-min_dfa([], Rs, MDFA) -> {MDFA,Rs}.
-
-min_delete([#dfa_state{no=N,trans=T,accept=A}|DFA], T, A, NewN, Rs, MDFA) ->
- min_delete(DFA, T, A, NewN, [{N,NewN}|Rs], MDFA);
-min_delete([D|DFA], T, A, NewN, Rs, MDFA) ->
- min_delete(DFA, T, A, NewN, Rs, [D|MDFA]);
-min_delete([], _T, _A, _NewN, Rs, MDFA) -> {MDFA,Rs}.
-
-min_update(DFA, Rs) ->
- [ D#dfa_state{trans=min_update_trans(D#dfa_state.trans, Rs)} || D <- DFA ].
-
-min_update_trans(Tr, Rs) ->
- [ {C,min_new_state(S, Rs)} || {C,S} <- Tr ].
-
-min_new_state(Old, [{Old,New}|_Reds]) -> New;
-min_new_state(Old, [_R|Reds]) -> min_new_state(Old, Reds);
-min_new_state(Old, []) -> Old.
-
-pack_dfa(DFA, N) -> pack_dfa(DFA, N, [], []).
-
-pack_dfa([D|DFA], NewN, Rs, PDFA) ->
- pack_dfa(DFA, NewN+1, [{D#dfa_state.no,NewN}|Rs],
- [D#dfa_state{no=NewN}|PDFA]);
-pack_dfa([], _NewN, Rs, PDFA) -> {PDFA,Rs}.
-
-%% comp_apply(String, StartPos, DFAReg) -> {match,RestPos,Rest} | nomatch.
-%% Apply the DFA of a regular expression to a string. If
-%% there is a match return the position of the remaining string and
-%% the string if else return 'nomatch'.
-%%
-%% StartPos should be the real start position as it is used to decide
-%% if we are at the beginning of the string.
-
-comp_apply(Cs, P, {DFA,Start,_Fail}) ->
- comp_apply(element(Start, DFA), Cs, P, DFA, nomatch).
-
-comp_apply(#c_state{spec=[]}=St, Cs, P, DFA, Accept) ->
- comp_apply_tr(St, Cs, P, DFA, Accept);
-comp_apply(#c_state{spec=Sp}=St, Cs, P, DFA, Accept) ->
- comp_apply_sp(St, Cs, P, DFA, Accept, Sp).
-
-comp_apply_tr(#c_state{trans=none,accept=A}, Cs, P, _DFA, Accept) ->
- %% End state.
- accept_value(A, Cs, P, Accept);
-comp_apply_tr(#c_state{trans=Tr,tmin=Tmin,smin=Smin,tmax=Tmax,smax=Smax,accept=A},
- [C|Cs]=Cs0, P, DFA, Accept) ->
- %% Get the next state number to go to.
- NextSt = if C =< Tmin -> Smin; %Below transition table
- C >= Tmax -> Smax; %Above transition table
- true -> %Otherwise use table
- element(C - Tmin, Tr)
- end,
- comp_apply(element(NextSt, DFA), Cs, P+1, DFA,
- accept_value(A, Cs0, P, Accept));
-comp_apply_tr(#c_state{trans=_Tr,accept=A}, [], P, _DFA, Accept) ->
- accept_value(A, [], P, Accept).
-
-comp_apply_sp(_St, Cs, 1, DFA, Accept, [{bos,S}|_]) ->
- comp_apply(element(S, DFA), Cs, 1, DFA, Accept);
-comp_apply_sp(_St, [$\n], P, DFA, Accept, [{eos,S}|_]) ->
- comp_apply(element(S, DFA), [], P, DFA, Accept);
-comp_apply_sp(_St, [], P, DFA, Accept, [{eos,S}|_]) ->
- comp_apply(element(S, DFA), [], P, DFA, Accept);
-comp_apply_sp(St, Cs, P, DFA, Accept, [_|Sp]) ->
- comp_apply_sp(St, Cs, P, DFA, Accept, Sp);
-comp_apply_sp(St, Cs, P, DFA, Accept, []) ->
- comp_apply_tr(St, Cs, P, DFA, Accept).
-
-accept_value(true, Cs, P, _Accept) -> {match,P,Cs};
-accept_value(false, _Cs, _P, Accept) -> Accept.
-
-%% compile(RegExp) -> {ok,RE} | {error,E}.
-%% Parse the regexp described in the string RegExp.
-
-compile(RegExp) ->
- case make_dfa([{RegExp,yes}], 2) of
- {ok,{DFA0,Start}} ->
- Fail = 1,
- DFA1 = [#dfa_state{no=Fail,accept=no,trans=[]}|DFA0],
- DFA = tuplelise_dfa(DFA1, 1),
- {ok,{comp_regexp,{DFA,Start,Fail}}};
- {error,E} -> {error,E}
- end.
-
-%% tuplelise_dfa(DFAstates, NoAcceptState) -> {{CompState},FirstState}.
-
-tuplelise_dfa(DFA0, NoAccept) ->
- DFA1 = map(fun (#dfa_state{no=N,trans=Ts,accept=A}) ->
- {Tr,Tmin,Smin,Tmax,Smax,Sp} = build_trans(Ts, NoAccept),
- #c_state{no=N,trans=Tr,tmin=Tmin,smin=Smin,
- tmax=Tmax,smax=Smax,
- accept=fix_accept(A),spec=Sp}
- end, DFA0),
- list_to_tuple(keysort(#dfa_state.no, DFA1)).
-
-build_trans(Ts0, NoAccept) ->
- %% Split transitions into character ranges and specials.
- {Ts1,Sp1} = foldl(fun ({{_,_},_}=T, {Ts,Sp}) -> {[T|Ts],Sp};
- ({_,_}=T, {Ts,Sp}) -> {Ts,[T|Sp]}
- end, {[],[]}, Ts0),
- if Ts1 == [] ->
- {none,none,none,none,none,Sp1};
- true ->
- %% Have transitions, convert to tuple.
- Ts2 = keysort(1, Ts1),
- {Tmin,Smin,Ts3} = min_trans(Ts2, NoAccept),
- %% ?dbg("exptr: ~p\n", [{Ts3,Tmin}]),
- {Trans,Tmax,Smax} = expand_trans(Ts3, Tmin, NoAccept),
- {list_to_tuple(Trans),Tmin,Smin,Tmax,Smax,Sp1}
- end.
-
-min_trans([{{0,C2},S}|Crs], _Def) -> {C2,S,Crs};
-min_trans([{{C1,_C2},_S}|_]=Crs, Def) -> {C1-1,Def,Crs}.
-
-expand_trans([{{C1,maxchar},S}], Last, Def) ->
- Trs = duplicate(C1-(Last+1), Def),
- {Trs,C1,S};
-expand_trans([{{C1,C2},S}], Last, Def) ->
- Trs = duplicate(C1-(Last+1), Def) ++ duplicate(C2-C1+1, S),
- {Trs,C2+1,Def};
-expand_trans([{{C1,C2},S}|Crs], Last, Def) ->
- {Trs0,Tmax,Smax} = expand_trans(Crs, C2, Def),
- Trs1 = duplicate(C1-(Last+1), Def) ++ duplicate(C2-C1+1, S) ++ Trs0,
- {Trs1,Tmax,Smax}.
-
-fix_accept({yes,_}) -> true;
-fix_accept(no) -> false.
-
diff --git a/lib/xmerl/src/xmerl_xsd_re.erl b/lib/xmerl/src/xmerl_xsd_re.erl
new file mode 100644
index 0000000000..9e7810f2ae
--- /dev/null
+++ b/lib/xmerl/src/xmerl_xsd_re.erl
@@ -0,0 +1,141 @@
+%%
+%% %CopyrightBegin%
+%%
+%% Copyright Ericsson AB 2003-2025. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%% http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%
+%% %CopyrightEnd%
+%%
+
+-module(xmerl_xsd_re).
+
+-export([map/1]). %% api
+
+-export([scan/1]). %% test
+
+%% map/1
+%%
+%% Map an XSD 1.0 regular expression to an equivalent PCRE regular
+%% expression as understood by re(3).
+
+-spec map(binary()) -> iodata().
+
+map(Bin) ->
+ case xmerl_xsd_re_parse:parse(scan(Bin)) of
+ {ok, RE} ->
+ %% io:format("map RE: ~p\n\n", [RE]),
+ RE;
+ {error, Reason} ->
+ %% io:format("map error reason: \n~p\n\n", [Reason]),
+ error({?MODULE, Reason})
+ end.
+
+%% scan/1
+%%
+%% Scanner for XSD 1.0 regular expressions as required by yecc. Just
+%% breaks the input into metacharacters, escapes (SingleCharEsc,
+%% MultiCharEsc, CatEsc/ComplEsc), digits, and other characters. Scan
+%% the entire input in one go since regular expressions aren't
+%% expected to be overly huge.
+
+-spec scan(binary()) -> [Tok]
+ when Tok :: {Sym, Pos}
+ | {Cat, Pos, Chr},
+ Sym :: eof
+ | '.' | '?' | '*' | '+' | '(' | ')' | '|' | '[' | ']'
+ | '{' | '}' | ',' | '-'
+ | '^' | '$' | ':',
+ Cat :: digit | multi | single | property | other,
+ Pos :: non_neg_integer(),
+ Chr :: pos_integer().
+
+scan(Bin) ->
+ scan(Bin, 0).
+
+%% scan/2
+
+scan(<<>>, N) ->
+ [{eof, N}];
+
+scan(<<$\\, C, B/binary>>, N) %% SingleCharEsc
+ when C == $n;
+ C == $r;
+ C == $t;
+ C == $\\;
+ C == $|;
+ C == $.;
+ C == $?;
+ C == $*;
+ C == $+;
+ C == $(;
+ C == $);
+ C == ${;
+ C == $};
+ C == $-;
+ C == $[;
+ C == $];
+ C == $^ ->
+ [{single, N, C} | scan(B, N+2)];
+
+scan(<<$\\, C, B/binary>>, N) %% MultiCharEsc
+ when C == $s;
+ C == $S;
+ C == $i;
+ C == $I;
+ C == $c;
+ C == $C;
+ C == $d;
+ C == $D;
+ C == $w;
+ C == $W ->
+ [{multi, N, C} | scan(B, N+2)];
+
+scan(<<$\\, C, B/binary>>, N)
+ when C == $p;
+ C == $P ->
+ [{property, N, C} | scan(B, N+2)];
+
+scan(<<C/utf8, B/binary>>, N)
+ when C /= $\\ ->
+ [chr(C, N) | scan(B, N+1)];
+
+scan(B, N) ->
+ error({?MODULE, N, B}).
+
+%% chr/2
+
+chr(C, N)
+ when C == $.;
+ C == $?;
+ C == $*;
+ C == $+;
+ C == $(;
+ C == $);
+ C == $|;
+ C == $[;
+ C == $];
+ C == ${;
+ C == $};
+ C == $,;
+ C == $-;
+ C == $^;
+ C == $$;
+ C == $: ->
+ {list_to_atom([C]), N};
+
+chr(C, N) when $0 =< C, C =< $9 ->
+ {digit, N, C};
+
+chr(C, N) ->
+ {other, N, C}.
diff --git a/lib/xmerl/src/xmerl_xsd_re_parse.yrl b/lib/xmerl/src/xmerl_xsd_re_parse.yrl
new file mode 100644
index 0000000000..dc6b35a9d6
--- /dev/null
+++ b/lib/xmerl/src/xmerl_xsd_re_parse.yrl
@@ -0,0 +1,578 @@
+Header "%%
+%% %CopyrightBegin%
+%%
+%% Copyright Ericsson AB 2003-2025. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the \"License\");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%% http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an \"AS IS\" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%
+%% %CopyrightEnd%
+%%".
+
+%%
+%% A grammar for the XSD 1.0 regular expression used by the YANG
+%% pattern statement:
+%%
+%% https://www.w3.org/TR/2004/REC-xmlschema-2-20041028/#regexs
+%%
+%% Produces an equivalent re(3) regular expression as an iolist(),
+%% mapping constructs as required.
+%%
+%% The 1.0 grammar is ambiguous in several places, rules being stated
+%% in text to try an remove the ambiguities. There are still problems
+%% however, which are noted in comments below. In particular, { as a
+%% regular character (Char) means that arbitrary lookahead is required
+%% to decide of it should be interpreted as a qualification or not;
+%% for example, 0{12} vs 0{12. The 1.1 specification recognizes this
+%% and require both { and } to be escaped, which is the solution
+%% adopted here: it's not clear if this is just a blunder in the 1.0
+%% spec, but probably since {} are listed as metacharacters that
+%% require escape to be interpreted as a normal character and no
+%% mention is made of the issue.
+%%
+
+Nonterminals
+ %% from the 1.0 grammar:
+ regExp branch piece atom
+ quantifier quantity 'QuantExact'
+ charClass 'Char' 'WildcardEsc'
+ charClassExpr charGroup
+ 'XmlChar' 'SingleCharEsc' 'MultiCharEsc' charClassEsc catEsc
+ charProp
+ %% adaptations for yecc and disambiguation
+ 'quantifier?' 'quantity?' 'quantmax?'
+ 'branch*' 'digit*' 'subtract?' group
+ 'group*' char
+ block 'block*' blockchar.
+
+Terminals
+ '.' '?' '*' '+' '(' ')' '|' '[' ']' %% not a Normal Character
+ '{' '}' ',' '-' %% Quantifier
+ '^' '$' ':'
+ digit single multi property other.
+
+Rootsymbol regExp.
+
+Endsymbol eof.
+
+%% ===========================================================================
+%% This is the XSD grammar with some modifications to avoid lookahead
+%% and otherwise adapt to yecc. The original XSD productions are
+%% provided as comments, followed by their yecc implementation.
+
+%% [1] regExp ::= branch ( '|' branch )*
+
+%% XSD regular expressions are implicitly anchored at both ends.
+regExp -> branch 'branch*' : ['$1' | '$2'].
+
+'branch*' -> '$empty' : [].
+'branch*' -> '|' regExp : [$| | '$2'].
+
+%% [2] branch ::= piece*
+
+branch -> '$empty' : [].
+branch -> piece branch : ['$1', '$2'].
+
+%% [3] piece ::= atom quantifier?
+
+piece -> atom 'quantifier?' : ['$1', '$2'].
+%% This is ambiguous since { can both be the start of a qualifier and
+%% and atom in its own right (through Char). For example, 0{1} is both
+%% a sequence of four atoms and one atom with a qualifier. Also,
+%% 0{12345 is a sequence of 7 atoms, so even if { as a qualifier takes
+%% precedence, this can require an unlimited amount of lookahead to
+%% decide that { isn't the start of a qualifier.
+%%
+%% This can't be intentional, and the grammar has changed in XSD 1.1
+%% to remove the ambiguity by disallowing {} in Char (renamed
+%% NormalChar). This is adopted here as the only reasonable solution.
+
+'quantifier?' -> '$empty' : [].
+'quantifier?' -> quantifier : '$1'.
+
+%% [4] quantifier ::= [?*+] | ( '{' quantity '}' )
+%% [5] quantity ::= quantRange | quantMin | QuantExact
+%% [6] quantRange ::= QuantExact ',' QuantExact
+%% [7] quantMin ::= QuantExact ','
+%% [8] QuantExact ::= [0-9]+
+
+quantifier -> '?' : "?".
+quantifier -> '*' : "*".
+quantifier -> '+' : "+".
+quantifier -> '{' quantity '}' : [${, '$2', $}].
+
+quantity -> 'QuantExact' 'quantity?' : ['$1' | '$2'].
+
+'quantity?' -> '$empty' : [].
+'quantity?' -> ',' 'quantmax?' : [$,, '$2'].
+
+'quantmax?' -> '$empty' : [].
+'quantmax?' -> 'QuantExact' : '$1'.
+
+'QuantExact' -> digit 'digit*' : [value('$1') | '$2'].
+
+'digit*' -> '$empty' : [].
+'digit*' -> digit 'digit*' : [value('$1') | '$2'].
+
+%% [9] atom ::= Char | charClass | ( '(' regExp ')' )
+
+atom -> 'Char' : '$1'.
+atom -> charClass : '$1'.
+atom -> '(' regExp ')' : [$(, '$2', $)].
+
+%% [10] Char ::= [^.\?*+()|#x5B#x5D]
+
+'Char' -> other : value('$1').
+'Char' -> digit : value('$1').
+
+'Char' -> ',' : ",".
+'Char' -> '-' : "-".
+
+%% ^ and $ are not metacharacters in XSD regular expressions.
+'Char' -> '^' : "\\^".
+'Char' -> '$' : "\\$".
+
+%% Allowing {} as the grammar specifies requires arbitrary lookahead
+%% to decide whether { is the start of a quantifier or an atom in its
+%% own right, assuming a quantifier takes precedence; for example,
+%% 0{12345 vs 0{12345}. The 1.1 grammar recognises this and disallows
+%% it, which is the only reasonable solution.
+%'Char' -> '{' : "{".
+%'Char' -> '}' : "}".
+
+'Char' -> ':' : ":".
+
+%% [11] charClass ::= charClassEsc | charClassExpr | WildcardEsc
+
+charClass -> 'SingleCharEsc' : '$1'.
+charClass -> charClassEsc : '$1'.
+charClass -> charClassExpr : '$1'.
+charClass -> 'WildcardEsc' : '$1'.
+
+%% [12] charClassExpr ::= '[' charGroup ']'
+%% [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
+%% [14] posCharGroup ::= ( charRange | charClassEsc )+
+%% [15] negCharGroup ::= '^' posCharGroup
+%% [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
+
+charClassExpr -> '[' charGroup ']' : '$2'.
+
+%% Parse ^- as normal characters (char below) and verify the textual
+%% rules on their use at the end of the CharClassExpr since expressing
+%% it in grammar seems rife with shift/reduce conflict.
+charGroup -> group 'subtract?' : group('$1', '$2').
+
+%% Character Class Subtraction doesn't exist in PRCE. For example,
+%% [a-d-[c]] means one of abd in XSD, but one of abcd-[ followed by ]
+%% in PCRE. Map to a negative lookahead assertion: (?![c])[a-d]. These
+%% nest with the expected semantics.
+'subtract?' -> '$empty' : [].
+'subtract?' -> charClassExpr : ["(?!", '$1', $)].
+
+group -> char 'group*' : ['$1' | '$2'].
+
+char -> '-' : $-.
+char -> '^' : $^.
+char -> 'XmlChar' : '$1'.
+char -> 'SingleCharEsc' : '$1'.
+char -> charClassEsc : {class, '$1'}.
+
+'group*' -> '$empty' : [].
+'group*' -> char 'group*' : ['$1' | '$2'].
+
+%% [17] charRange ::= seRange | XmlCharIncDash
+%% [18] seRange ::= charOrEsc '-' charOrEsc
+%% [20] charOrEsc ::= XmlChar | SingleCharEsc
+%% [21] XmlChar ::= [^\#x2D#x5B#x5D]
+%% [22] XmlCharIncDash ::= [^\#x5B#x5D]
+%%
+%% "A single XML character is a ·character range· that identifies the
+%% set of characters containing only itself. All XML characters are
+%% valid character ranges, except as follows:
+%%
+%% The [, ], - and \ characters are not valid character ranges;
+%% The ^ character is only valid at the beginning of a ·positive
+%% character group· if it is part of a ·negative character group·
+%% The - character is a valid character range only at the beginning
+%% or end of a ·positive character group.
+%%
+%% Note: The grammar for ·character range· as given above is
+%% ambiguous, but the second and third bullets above together
+%% remove the ambiguity."
+
+%% The second rule above disallows including ^ in a character group:
+%% it can be excluded with [^^], but not included with [^]. (Which the
+%% 1.1 specification explicitly notes.)
+
+%% Anything except \[], or ^- to deal with the aforementioned ambiguity.
+'XmlChar' -> '.' : ".".
+'XmlChar' -> '?' : "?".
+'XmlChar' -> '*' : "*".
+'XmlChar' -> '+' : "+".
+'XmlChar' -> '(' : "(".
+'XmlChar' -> ')' : ")".
+'XmlChar' -> '|' : "|".
+'XmlChar' -> '{' : "{".
+'XmlChar' -> '}' : "}".
+'XmlChar' -> ',' : ",".
+'XmlChar' -> '$' : "$". %% not a metacharacter in a PCRE range
+'XmlChar' -> ':' : "\\:". %% avoid [: :]
+'XmlChar' -> digit : value('$1').
+'XmlChar' -> other : value('$1').
+
+%% [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
+
+%% Move SingleCharEsc up to be able to differentiate between
+%% characters classes in a charGroup and as an atom.
+%charClassEsc -> 'SingleCharEsc' : '$1'.
+charClassEsc -> 'MultiCharEsc' : '$1'.
+charClassEsc -> catEsc : '$1'.
+
+%% [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
+
+'SingleCharEsc' -> single : escape('$1').
+
+%% [25] catEsc ::= '\p{' charProp '}'
+%% [26] complEsc ::= '\P{' charProp '}'
+
+catEsc -> property '{' charProp '}' : prop(value('$1'), '$3', prop('$3')).
+
+%% [27] charProp ::= IsCategory | IsBlock
+
+%% [28] IsCategory ::= Letters | Marks | Numbers | Punctuation | Separators | Symbols | Others
+%% [29] Letters ::= 'L' [ultmo]?
+%% [30] Marks ::= 'M' [nce]?
+%% [31] Numbers ::= 'N' [dlo]?
+%% [32] Punctuation ::= 'P' [cdseifo]?
+%% [33] Separators ::= 'Z' [slp]?
+%% [34] Symbols ::= 'S' [mcko]?
+%% [35] Others ::= 'C' [cfon]?
+
+%% [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
+
+charProp -> block : '$1'.
+
+block -> blockchar 'block*' : ['$1' | '$2'].
+
+'block*' -> '$empty' : [].
+'block*' -> blockchar 'block*' : ['$1' | '$2'].
+
+blockchar -> other : value('$1').
+blockchar -> digit : value('$1').
+blockchar -> '-' : $-.
+
+%% [37] MultiCharEsc ::= '\' [sSiIcCdDwW]
+
+'MultiCharEsc' -> multi : escape('$1').
+
+%% [37a] WildcardEsc ::= '.'
+
+'WildcardEsc' -> '.' : ".".
+
+Erlang code.
+%% value/1
+
+value({_,_,C}) ->
+ C;
+value({A,_}) ->
+ atom_to_list(A).
+
+%% escape/1
+
+escape({_,_,$i}) ->
+ ["[:A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]"];
+escape({_,_,$I}) ->
+ ["[^:A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]"];
+escape({_,_,$c}) ->
+ ["[-.0-9:A-Z_a-z\u00B7\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u037D\u037F-\u1FFF\u200C-\u200D\u203F\u2040\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]"];
+escape({_,_,$C}) ->
+ ["[^-.0-9:A-Z_a-z\u00B7\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u037D\u037F-\u1FFF\u200C-\u200D\u203F\u2040\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]"];
+escape({_,_,C}) ->
+ ["\\", C].
+
+%% group/2
+%%
+%% The mapping of character groups is slightly complex since neither
+%% Character Class Subtraction nor Block Escape exists in PCRE. The
+%% former is mapped to a negative lookup assertion, the latter to a
+%% range. The range is complemented if need be to deal with negation
+%% in either the escape (ie. \P) or the group.
+%%
+%% Examples:
+%%
+%% \P{IsBasicLatin} -> [^\U0000-\U+007F]
+%% [a-z\p{IsRunic}] -> [a-z\U+16A0-\U+16FF]
+%% [^a-z\p{IsRunic}] -> [^a-z\U+0000-\U+169F\U+1700-\U+FFFF]
+%% [0\P{IsBasicLatin}] -> [0\U+0080-\U+FFFF]
+%% [a-z\P{IsBasicLatin}-[ei]] -> (?!([e]|[i]))[a-z\U+0080-\U+FFFF]
+%%
+%% (Replace \U+XXXX with actual characters.)
+
+group(Grp, Sub) ->
+ [Neg, Pre | Rest] = group(Grp),
+ [Sub, $[, Neg, Pre, alt(Rest, Sub /= [], Neg == $^), $]].
+
+%% group/1
+%%
+%% Extract leading ^ and/or -.
+
+group([$^, C | L])
+ when C == $^;
+ C == $- ->
+ [$^, C | L];
+
+group([$^ | L]) ->
+ [$^, [] | L];
+
+group([$- | L]) ->
+ [[], $- | L];
+
+group([_|_] = L) ->
+ [[], [] | L].
+
+%% alt/3
+%%
+%% Return a character group in which Block Escapes are inlined. The
+%% error reporting isn't great, since it doesn't point at a location
+%% in the expression, which would require passing more information
+%% through the parse.
+
+%%alt([$^ | _], _, _) ->
+%% fail({invalid_range, $^});
+
+alt([$-, $-], false, _) ->
+ fail({invalid_range, "-"});
+
+alt([T, $-], Sub, Neg) ->
+ [case T of {class, C} -> class(Neg, C); _ -> T end, [$- || not Sub]];
+
+alt([$-], Sub, _) ->
+ [$- || not Sub];
+
+alt([_], true, _) ->
+ fail({invalid_range, "["});
+
+alt([$- | _], _, _) ->
+ fail({invalid_range, "-"});
+
+alt([{class, C} | Rest], Sub, Neg) ->
+ [class(Neg, C) | alt(Rest, Sub, Neg)];
+
+alt([C, $-, D | Rest], Sub, Neg) ->
+ [C, $-, D | alt(Rest, Sub, Neg)];
+
+alt([C | Rest], Sub, Neg) ->
+ [C | alt(Rest, Sub, Neg)];
+
+alt([] = L, false, _) ->
+ L;
+
+alt([], true, _) ->
+ fail({invalid_range, $[}).
+
+%% class/2
+%%
+%% Complement a range of a Block Escape if need be.
+
+class(Neg, [$[, P, A, $-, B, $]])
+ when P == "^", Neg;
+ P == [], not Neg ->
+ [A, $-, B];
+
+class(_, [$[, _, <<A/utf8>>, $-, <<B/utf8>>, $]]) ->
+ [[[<<L/utf8>>, $-, <<(A-1)/utf8>>] || L <- [0], L < A],
+ [[<<(B+1)/utf8>>, $-, <<H/utf8>>] || H <- [16#FFFF], B < H]];
+
+class(_, Cs) ->
+ Cs.
+
+%% property/3
+
+prop(C, B, ok) ->
+ ["\\", C, ${, B, $}];
+
+prop(C, _, {Start, End}) ->
+ [$[, [$^ || C == $P], <<Start/utf8>>, $-, <<End/utf8>>, $]];
+
+prop(_, Block, false) ->
+ fail({unknown_block, Block}).
+
+%% prop/1
+%%
+%% All of the 1-2 letter properties are supported by PCRE, but the
+%% latter also supports Cs, so guard that only XSD properties are
+%% parsed.
+
+prop([P])
+ when P == $L;
+ P == $M;
+ P == $N;
+ P == $P;
+ P == $Z;
+ P == $S;
+ P == $C ->
+ ok;
+
+prop([$L, C])
+ when C == $u;
+ C == $l;
+ C == $t;
+ C == $m;
+ C == $o ->
+ ok;
+
+prop([$M, C])
+ when C == $n;
+ C == $c;
+ C == $e ->
+ ok;
+
+prop([$N, C])
+ when C == $d;
+ C == $l;
+ C == $o ->
+ ok;
+
+prop([$P, C])
+ when C == $c;
+ C == $d;
+ C == $s;
+ C == $e;
+ C == $i;
+ C == $f;
+ C == $o ->
+ ok;
+
+prop([$Z, C])
+ when C == $s;
+ C == $l;
+ C == $p ->
+ ok;
+
+prop([$S, C])
+ when C == $m;
+ C == $c;
+ C == $k;
+ C == $o ->
+ ok;
+
+prop([$C, C])
+ when C == $c;
+ C == $f;
+ C == $o;
+ C == $n ->
+ ok;
+
+prop([$I, $s | Rest]) ->
+ block(Rest);
+
+prop(B) ->
+ fail({unknown_property, B}).
+
+%% block/1
+%%
+%% Some of these are supported by PCRE, but many aren't. Map to a
+%% character range in each case.
+
+block("BasicLatin") -> {16#0000, 16#007F};
+block("Latin-1Supplement") -> {16#0080, 16#00FF};
+block("LatinExtended-A") -> {16#0100, 16#017F};
+block("LatinExtended-B") -> {16#0180, 16#024F};
+block("IPAExtensions") -> {16#0250, 16#02AF};
+block("SpacingModifierLetters") -> {16#02B0, 16#02FF};
+block("CombiningDiacriticalMarks") -> {16#0300, 16#036F};
+block("Greek") -> {16#0370, 16#03FF};
+block("Cyrillic") -> {16#0400, 16#04FF};
+block("Armenian") -> {16#0530, 16#058F};
+block("Hebrew") -> {16#0590, 16#05FF};
+block("Arabic") -> {16#0600, 16#06FF};
+block("Syriac") -> {16#0700, 16#074F};
+block("Thaana") -> {16#0780, 16#07BF};
+block("Devanagari") -> {16#0900, 16#097F};
+block("Bengali") -> {16#0980, 16#09FF};
+block("Gurmukhi") -> {16#0A00, 16#0A7F};
+block("Gujarati") -> {16#0A80, 16#0AFF};
+block("Oriya") -> {16#0B00, 16#0B7F};
+block("Tamil") -> {16#0B80, 16#0BFF};
+block("Telugu") -> {16#0C00, 16#0C7F};
+block("Kannada") -> {16#0C80, 16#0CFF};
+block("Malayalam") -> {16#0D00, 16#0D7F};
+block("Sinhala") -> {16#0D80, 16#0DFF};
+block("Thai") -> {16#0E00, 16#0E7F};
+block("Lao") -> {16#0E80, 16#0EFF};
+block("Tibetan") -> {16#0F00, 16#0FFF};
+block("Myanmar") -> {16#1000, 16#109F};
+block("Georgian") -> {16#10A0, 16#10FF};
+block("HangulJamo") -> {16#1100, 16#11FF};
+block("Ethiopic") -> {16#1200, 16#137F};
+block("Cherokee") -> {16#13A0, 16#13FF};
+block("UnifiedCanadianAboriginalSyllabics") -> {16#1400, 16#167F};
+block("Ogham") -> {16#1680, 16#169F};
+block("Runic") -> {16#16A0, 16#16FF};
+block("Khmer") -> {16#1780, 16#17FF};
+block("Mongolian") -> {16#1800, 16#18AF};
+block("LatinExtendedAdditional") -> {16#1E00, 16#1EFF};
+block("GreekExtended") -> {16#1F00, 16#1FFF};
+block("GeneralPunctuation") -> {16#2000, 16#206F};
+block("SuperscriptsandSubscripts") -> {16#2070, 16#209F};
+block("CurrencySymbols") -> {16#20A0, 16#20CF};
+block("CombiningMarksforSymbols") -> {16#20D0, 16#20FF};
+block("LetterlikeSymbols") -> {16#2100, 16#214F};
+block("NumberForms") -> {16#2150, 16#218F};
+block("Arrows") -> {16#2190, 16#21FF};
+block("MathematicalOperators") -> {16#2200, 16#22FF};
+block("MiscellaneousTechnical") -> {16#2300, 16#23FF};
+block("ControlPictures") -> {16#2400, 16#243F};
+block("OpticalCharacterRecognition") -> {16#2440, 16#245F};
+block("EnclosedAlphanumerics") -> {16#2460, 16#24FF};
+block("BoxDrawing") -> {16#2500, 16#257F};
+block("BlockElements") -> {16#2580, 16#259F};
+block("GeometricShapes") -> {16#25A0, 16#25FF};
+block("MiscellaneousSymbols") -> {16#2600, 16#26FF};
+block("Dingbats") -> {16#2700, 16#27BF};
+block("BraillePatterns") -> {16#2800, 16#28FF};
+block("CJKRadicalsSupplement") -> {16#2E80, 16#2EFF};
+block("KangxiRadicals") -> {16#2F00, 16#2FDF};
+block("IdeographicDescriptionCharacters") -> {16#2FF0, 16#2FFF};
+block("CJKSymbolsandPunctuation") -> {16#3000, 16#303F};
+block("Hiragana") -> {16#3040, 16#309F};
+block("Katakana") -> {16#30A0, 16#30FF};
+block("Bopomofo") -> {16#3100, 16#312F};
+block("HangulCompatibilityJamo") -> {16#3130, 16#318F};
+block("Kanbun") -> {16#3190, 16#319F};
+block("BopomofoExtended") -> {16#31A0, 16#31BF};
+block("EnclosedCJKLettersandMonths") -> {16#3200, 16#32FF};
+block("CJKCompatibility") -> {16#3300, 16#33FF};
+block("CJKUnifiedIdeographsExtensionA") -> {16#3400, 16#4DB5};
+block("CJKUnifiedIdeographs") -> {16#4E00, 16#9FFF};
+block("YiSyllables") -> {16#A000, 16#A48F};
+block("YiRadicals") -> {16#A490, 16#A4CF};
+block("HangulSyllables") -> {16#AC00, 16#D7A3};
+block("PrivateUse") -> {16#E000, 16#F8FF};
+block("CJKCompatibilityIdeographs") -> {16#F900, 16#FAFF};
+block("AlphabeticPresentationForms") -> {16#FB00, 16#FB4F};
+block("ArabicPresentationForms-A") -> {16#FB50, 16#FDFF};
+block("CombiningHalfMarks") -> {16#FE20, 16#FE2F};
+block("CJKCompatibilityForms") -> {16#FE30, 16#FE4F};
+block("SmallFormVariants") -> {16#FE50, 16#FE6F};
+block("ArabicPresentationForms-B") -> {16#FE70, 16#FEFE};
+%block("Specials") -> {16#FEFF, 16#FEFF};
+block("HalfwidthandFullwidthForms") -> {16#FF00, 16#FFEF};
+%block("Specials") -> {16#FFF0, 16#FFFD};
+block("Specials") -> {16#FEFF, 16#FFFD};
+
+block(_) -> false.
+
+%% fail/1
+
+fail(T) ->
+ error({?MODULE, T}).
diff --git a/lib/xmerl/src/xmerl_xsd_type.erl b/lib/xmerl/src/xmerl_xsd_type.erl
index 612a4ae340..32f0725558 100644
--- a/lib/xmerl/src/xmerl_xsd_type.erl
+++ b/lib/xmerl/src/xmerl_xsd_type.erl
@@ -14,7 +14,7 @@
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
-%%
+%%
%% %CopyrightEnd%
%%
@@ -93,7 +93,7 @@ check_simpleType(double,Value,_S) ->
% extended format PnYnMnDTnHnMnS where n is an integer. The n value
% before S may include decimal fraction.
check_simpleType(duration,Value,_S) ->
- ?catch_exit(check_duration(Value),Value,invalid_duration);
+ ?catch_exit(check_duration(Value),Value,invalid_duration);
check_simpleType(dateTime,Value,_S) ->
?catch_exit(check_dateTime(Value),Value,invalid_dateTime);
check_simpleType(time,Value,_S) ->
@@ -269,7 +269,7 @@ check_duration("P"++Value) ->
{Date,Time}=lists:splitwith(fun($T) -> false;(_) -> true end,Value),
{ok,_} = check_duration_date(Date,["Y","M","D"]),
{ok,_} = check_duration_time(Time,["T","H","M","S"]).
-
+
check_duration_date("",_) ->
{ok,""};
check_duration_date(Date,[H|T]) ->
@@ -284,7 +284,7 @@ check_duration_date(Date,[H|T]) ->
end.
%% Time any combination of TnHnMfS
%% n unsigned integers and f unsigned decimal
-%%check_duration_time(Time,["T","H","M","S"])
+%%check_duration_time(Time,["T","H","M","S"])
check_duration_time("",[_H|_T]) ->
{ok,""};
check_duration_time(Time,[S]) ->
@@ -318,7 +318,7 @@ check_positive_integer(Value) ->
%% check_integer and thereof derived types
check_integer(Value) ->
{ok,list_to_integer(Value)}.
-
+
check_nonPositiveInteger(Value) ->
check_constr_int(Value,undefined,0,illegal_nonPositiveInteger).
@@ -371,7 +371,7 @@ check_constr_int(Value,Min,Max,ErrMsg) ->
{error,{ErrMsg}}
end.
-%% DateTime on form: '-'? yyyy '-' mm '-' dd 'T' hh ':' mm ':' ss
+%% DateTime on form: '-'? yyyy '-' mm '-' dd 'T' hh ':' mm ':' ss
%% ('.' s+)? (zzzzzz)?
check_dateTime("-"++DateTime) ->
check_dateTime(DateTime);
@@ -400,14 +400,14 @@ check_month(Str) ->
case check_positive_integer(Str) of
{ok,Int} when Int >= 1,Int =< 12 ->
{ok,Int};
- _ ->
+ _ ->
{error,{invalid_month,Str}}
end.
check_day(Str) ->
case check_positive_integer(Str) of
{ok,Int} when Int >= 1,Int =< 31 ->
{ok,Int};
- _ ->
+ _ ->
{error,{invalid_day,Str}}
end.
@@ -498,7 +498,7 @@ check_date(Date) ->
{ok,_}=check_year(Year),
{ok,_}=check_month(Month),
{ok,_}=check_day(Day).
-
+
%% gYearMonth on the form: '-'? ccyy '-' mm zzzzzz?
check_gYearMonth("-"++Value) ->
check_gYearMonth(Value);
@@ -531,7 +531,7 @@ check_gYear(Value) ->
Y
end,
{ok,_} = check_year(Year).
-
+
%% gMonthDay on the form: mm dd zzzzzz?
check_gMonthDay("--"++Value) ->
{M,"-"++DTZ} = lists:split(2,Value),
@@ -652,7 +652,7 @@ check_IDREF(Value) ->
check_IDREFS(Value) ->
check_list_type(Value,fun check_IDREF/1).
-
+
check_ENTITY(Value) ->
true = xmerl_lib:is_ncname(Value),
{ok,Value}.
@@ -665,11 +665,11 @@ check_list_type(Value,BaseTypeFun) ->
lists:foreach(BaseTypeFun,Tokens),
{ok,Value}.
-ns_whitespace(WS) when WS==16#9;WS==16#A;WS==16#D ->
+ns_whitespace(WS) when WS==16#9;WS==16#A;WS==16#D ->
true;
ns_whitespace(_) ->
false.
-
+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% facet functions
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -708,7 +708,7 @@ facet_fun(Type,F) ->
end.
-length_fun(T,V)
+length_fun(T,V)
when T==string;T==normalizedString;T==token;
T=='Name';T=='NCName';T==language;T=='ID';
T=='IDREF';T=='IDREFS';T=='ENTITY';T=='ENTITIES';
@@ -735,7 +735,7 @@ length_fun(T,_V) ->
{error,{length_not_applicable_on,T}}
end.
-minLength_fun(T,V)
+minLength_fun(T,V)
when T==string;T==normalizedString;T==token;
T=='Name';T=='NCName';T==language;T=='ID';
T=='IDREF';T=='IDREFS';T=='ENTITY';T=='ENTITIES';
@@ -762,7 +762,7 @@ minLength_fun(T,_V) ->
{error,{minLength_not_applicable_on,T}}
end.
-maxLength_fun(T,V)
+maxLength_fun(T,V)
when T==string;T==normalizedString;T==token;
T=='Name';T=='NCName';T==language;T=='ID';
T=='IDREF';T=='IDREFS';T=='ENTITY';T=='ENTITIES';
@@ -789,21 +789,30 @@ maxLength_fun(T,_V) ->
{error,{maxLength_not_applicable_on,T}}
end.
-pattern_fun(_Type,RegExp) ->
- case xmerl_regexp:setup(RegExp) of
- {ok,RE} ->
- fun(Val) ->
- case xmerl_regexp:first_match(Val,RE) of
- {match,_,_} -> {ok,Val};
- _ -> {error,{pattern_mismatch,Val,RegExp}}
- end
- end;
- _ ->
- fun(Val) ->
- {error,{unsupported_pattern,Val,RegExp}}
- end
+pattern_fun(_Type, RegExp) ->
+ try
+ ReRegExp = xmerl_xsd_re:map(conv_list_to_binary(RegExp)),
+ BinReRegExp = unicode:characters_to_binary(lists:flatten(ReRegExp)),
+ {ok, CompiledRegExp} =
+ re:compile([$^, $(, BinReRegExp, $), $$], [no_auto_capture, unicode]),
+ fun(Val) ->
+ case re:run(conv_list_to_binary(Val), CompiledRegExp) of
+ {match, _} -> {ok, Val};
+ _ -> {error, {pattern_mismatch, Val, RegExp}}
+ end
+ end
+ catch
+ _ ->
+ fun(Val) ->
+ {error,{unsupported_pattern, Val, RegExp}}
+ end
end.
+conv_list_to_binary(V) when is_binary(V) ->
+ V;
+conv_list_to_binary(V) when is_list(V) ->
+ unicode:characters_to_binary(V).
+
enumeration_fun(_Type,V) ->
fun(Val) ->
case lists:member(Val,V) of
@@ -845,8 +854,8 @@ collapse_ws([H|T],Acc) ->
collapse_ws(T,[H|Acc]);
collapse_ws([],Acc) ->
lists:reverse(lists:dropwhile(fun($ ) ->true;(_) -> false end,Acc)).
-
-maxInclusive_fun(T,V)
+
+maxInclusive_fun(T,V)
when T==integer;T==positiveInteger;T==negativeInteger;
T==nonNegativeInteger;T==nonPositiveInteger;T==long;
T==unsignedLong;T==int;T==unsignedInt;T==short;
@@ -895,7 +904,7 @@ maxInclusive_fun(T,_V) ->
%% T==gMonth;T==gMonthDay;T==gDay ->
fun(_) -> {error,{maxInclusive,not_implemented_for,T}} end.
-maxExclusive_fun(T,V)
+maxExclusive_fun(T,V)
when T==integer;T==positiveInteger;T==negativeInteger;
T==nonNegativeInteger;T==nonPositiveInteger;T==long;
T==unsignedLong;T==int;T==unsignedInt;T==short;
@@ -904,7 +913,7 @@ maxExclusive_fun(T,V)
try list_to_integer(Val) < list_to_integer(V) of
true ->
{ok,Val};
- false ->
+ false ->
{error,{maxExclusive,Val,not_less_than,V}}
catch
_:_ ->
@@ -942,7 +951,7 @@ maxExclusive_fun(T,V) when T==dateTime ->
maxExclusive_fun(T,_V) ->
fun(_) -> {error,{maxExclusive,not_implemented_for,T}} end.
-minExclusive_fun(T,V)
+minExclusive_fun(T,V)
when T==integer;T==positiveInteger;T==negativeInteger;
T==nonNegativeInteger;T==nonPositiveInteger;T==long;
T==unsignedLong;T==int;T==unsignedInt;T==short;
@@ -951,7 +960,7 @@ minExclusive_fun(T,V)
try list_to_integer(Val) > list_to_integer(V) of
true ->
{ok,Val};
- false ->
+ false ->
{error,{minExclusive,Val,not_greater_than,V}}
catch
_:_ ->
@@ -989,7 +998,7 @@ minExclusive_fun(T,V) when T==dateTime ->
minExclusive_fun(T,_V) ->
fun(_) -> {error,{minExclusive,not_implemented_for,T}} end.
-minInclusive_fun(T,V)
+minInclusive_fun(T,V)
when T==integer;T==positiveInteger;T==negativeInteger;
T==nonNegativeInteger;T==nonPositiveInteger;T==long;
T==unsignedLong;T==int;T==unsignedInt;T==short;
@@ -998,7 +1007,7 @@ minInclusive_fun(T,V)
try list_to_integer(Val) >= list_to_integer(V) of
true ->
{ok,Val};
- false ->
+ false ->
{error,{minInclusive,Val,not_greater_than_or_equal_with,V}}
catch
_:_ ->
@@ -1035,7 +1044,7 @@ minInclusive_fun(T,V) when T==dateTime ->
end;
minInclusive_fun(T,_V) ->
fun(_) -> {error,{minInclusive,not_implemented_for,T}} end.
-
+
totalDigits_fun(T,V)
when T==integer;T==positiveInteger;T==negativeInteger;T==nonNegativeInteger;
T==nonPositiveInteger;T==long;T==unsignedLong;T==int;T==unsignedInt;
@@ -1051,7 +1060,7 @@ totalDigits_fun(T,V)
case lists:member($.,Val2) of
true ->
length(lists:dropwhile(Pred,lists:reverse(Val2))) -1;
- _ ->
+ _ ->
length(Val2)
end,
if
@@ -1063,12 +1072,12 @@ totalDigits_fun(T,V)
end;
totalDigits_fun(T,_V) ->
fun(_) -> {error,{totalDigits,not_applicable,T}} end.
-
+
fractionDigits_fun(T,V)
when T==integer;T==positiveInteger;T==negativeInteger;T==nonNegativeInteger;
T==nonPositiveInteger;T==long;T==unsignedLong;T==int;T==unsignedInt;
T==short;T==unsignedShort;T==byte;T==unsignedByte;T==decimal ->
- fun(Val) ->
+ fun(Val) ->
Len =
case string:tokens(Val,".") of
[_I,Frc] when T==decimal ->
@@ -1079,7 +1088,7 @@ fractionDigits_fun(T,V)
_ ->
0
end,
- if
+ if
Len =< V ->
{ok,Val};
true ->
@@ -1088,7 +1097,7 @@ fractionDigits_fun(T,V)
end;
fractionDigits_fun(T,_V) ->
fun(_) -> {error,{fractionDigits,not_applicable,T}} end.
-
+
%% The relation between F1 and F2 may be eq,lt or gt.
%% lt: F1 < F2
@@ -1130,13 +1139,13 @@ compare_floats2({S1,B1,D1,E1},{_S2,B2,D2,E2}) ->
I1 < I2 -> sign(S1,lt);
true ->
%% fractions are compared in lexicographic order
- if
+ if
D1 == D2 -> eq;
D1 < D2 -> sign(S1,lt);
D1 > D2 -> sign(S1,gt)
end
end.
-
+
str_to_float(String) ->
{Sign,Str} =
case String of
@@ -1172,7 +1181,7 @@ pow(Mantissa,Exponent) ->
end.
pow(Mantissa,Fraction,Exponent) ->
- (Mantissa * math:pow(10,Exponent)) +
+ (Mantissa * math:pow(10,Exponent)) +
(list_to_integer(Fraction) * math:pow(10,Exponent-length(Fraction))).
sign('-',gt) ->
@@ -1195,7 +1204,7 @@ remove_trailing_zeros(Str) ->
%% T==gMonth;T==gMonthDay;T==gDay ->
%% compare_duration(V1,V2) compares V1 to V2
-%% returns gt | lt | eq | indefinite
+%% returns gt | lt | eq | indefinite
%% ex: V1 > V2 -> gt
%%
%% V1, V2 on format PnYnMnDTnHnMnS
@@ -1283,7 +1292,7 @@ compare_dateTime(P,Q) when is_list(Q) ->
compare_dateTime(P,normalize_dateTime(dateTime_atoms(Q)));
compare_dateTime(_P,_Q) ->
indefinite.
-
+
fQuotient(A,B) when is_float(A) ->
fQuotient(erlang:floor(A),B);
fQuotient(A,B) when is_float(B) ->
@@ -1308,7 +1317,7 @@ modulo(A,B) ->
modulo(A, Low, High) ->
modulo(A - Low, High - Low) + Low.
-
+
maximumDayInMonthFor(YearValue, MonthValue) ->
M = modulo(MonthValue, 1, 13),
Y = YearValue + fQuotient(MonthValue, 1, 13),
@@ -1330,7 +1339,7 @@ monthValue(_M,Y) ->
28
end
end.
-
+
%% S dateTime, D duration
%% result is E dateTime, end of time period with start S and duration
%% D. E = S + D.
@@ -1357,25 +1366,25 @@ add_duration2dateTime2({Syear,Smonth,Sday,Shour,Sminute,Ssec,Szone},
Temp1 = Smonth + Dmonths,
Emonth = modulo(Temp1,1,13),
Carry1 = fQuotient(Temp1,1,13),
-
+
%% years
Eyear = Syear + Dyears + Carry1,
-
+
%% seconds
Temp2 = Ssec + Dsecs,
Esecs = modulo(Temp2,60),
Carry2 = fQuotient(Temp2,60),
-
+
%% minutes
Temp3 = Sminute + Dminutes + Carry2,
Eminute = modulo(Temp3,60),
Carry3 = fQuotient(Temp3,60),
-
+
%% hours
Temp4 = Shour + Dhours + Carry3,
Ehour = modulo(Temp4,24),
Carry4 = fQuotient(Temp4,24),
-
+
%% days
TempDays =
case maximumDayInMonthFor(Eyear,Emonth) of
@@ -1451,7 +1460,7 @@ zone_atoms(Sign,Zone) when is_list(Zone) ->
zone_atoms(_Sign,Zone) ->
Zone.
-
+
%% Format: '-'? PnYnMnDTnHnMnS
duration_atoms("-P"++Dur) ->
duration_atoms2(Dur,neg);
@@ -1539,8 +1548,8 @@ get_sec([$S|T],Acc,_) ->
{lists:reverse(Acc),T};
get_sec(_,_,Str) ->
{"0",Str}.
-
-
+
+
set_sign(pos,Istr) ->
list_to_integer(Istr);
set_sign(_,Istr) ->
@@ -1572,7 +1581,7 @@ normalize_dateTime({Y,M,D,Hour,Min,Sec,{Sign,ZH,ZM}}) ->
TmpHour = Hour + set_sign(invert_sign(Sign),integer_to_list(ZH)) + Carry1,
NHour = modulo(TmpHour,24),
Carry2 = fQuotient(TmpHour,24),
-
+
{NY,NM,ND} =
carry_loop(D+Carry2,M,Y),
{NY,NM,ND,NHour,NMin,Sec,{pos,0,0}};
diff --git a/lib/xmerl/test/xmerl_SUITE.erl b/lib/xmerl/test/xmerl_SUITE.erl
index 11ec26192f..4f3e99fbe8 100644
--- a/lib/xmerl/test/xmerl_SUITE.erl
+++ b/lib/xmerl/test/xmerl_SUITE.erl
@@ -653,7 +653,6 @@ allow_entities_test(Config) ->
(catch xmerl_scan:file(File, [{allow_entities, false}])),
ok.
-
%%======================================================================
%% Support Functions
%%======================================================================
diff --git a/lib/xmerl/test/xmerl_xsd_SUITE.erl b/lib/xmerl/test/xmerl_xsd_SUITE.erl
index 3060f27e6c..a568cd102c 100644
--- a/lib/xmerl/test/xmerl_xsd_SUITE.erl
+++ b/lib/xmerl/test/xmerl_xsd_SUITE.erl
@@ -66,7 +66,8 @@ groups() ->
sis2, state2file_file2state, union]},
{ticket_tests, [],
[ticket_6910, ticket_7165, ticket_7190, ticket_7288,
- ticket_7736, ticket_8599, ticket_9410, ticket_19792]},
+ ticket_7736, ticket_8599, ticket_9410, ticket_19762,
+ ticket_19792]},
{facets, [],
[length, minLength, maxLength, pattern, enumeration,
whiteSpace, maxInclusive, maxExclusive, minExclusive,
@@ -965,10 +965,14 @@ ticket_8599(Config) ->
{{xmlElement,persons,persons,_,_,_,_,_,_,_,_,_},_GlobalState} = xmerl_xsd:validate(E, S).
-
ticket_9410(Config) ->
- file:set_cwd(datadir_join(Config,[".."])),
- {ok, _S} = xmerl_xsd:process_schema("xmerl_xsd_SUITE_data/small.xsd").
+ {ok, _S} = xmerl_xsd:process_schema(datadir_join(Config,["small.xsd"])).
+
+ticket_19762(Config) ->
+ {E, _} = xmerl_scan:file(datadir_join(Config,["ticket_19762.xml"]),[]),
+ {ok, S} = xmerl_xsd:process_schema(datadir_join(Config,["ticket_19762.xsd"])),
+ {E, _} = xmerl_xsd:validate(E, S),
+ ok.
ticket_19792(Config) ->
diff --git a/lib/xmerl/test/xmerl_xsd_SUITE_data/ticket_19762.xml b/lib/xmerl/test/xmerl_xsd_SUITE_data/ticket_19762.xml
new file mode 100644
index 0000000000..792ebde504
--- /dev/null
+++ b/lib/xmerl/test/xmerl_xsd_SUITE_data/ticket_19762.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<body>
+ <test testattr="Testing"/>
+</body>
diff --git a/lib/xmerl/test/xmerl_xsd_SUITE_data/ticket_19762.xsd b/lib/xmerl/test/xmerl_xsd_SUITE_data/ticket_19762.xsd
new file mode 100644
index 0000000000..3af9f59a8c
--- /dev/null
+++ b/lib/xmerl/test/xmerl_xsd_SUITE_data/ticket_19762.xsd
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="file:///C:/xsd/unpack-1/LKF-XSD/LicFormat20_Schema_Rev_E_MOD.xsd">
+ <xs:attribute name="testattr">
+ <xs:simpleType>
+ <xs:restriction base="xs:string">
+ <xs:minLength value="5"/>
+ <xs:maxLength value="20"/>
+ <xs:pattern value="([a-zA-Z0-9\-\s/_])*"/>
+ </xs:restriction>
+ </xs:simpleType>
+ </xs:attribute>
+
+ <xs:element name="test">
+ <xs:complexType>
+ <xs:attribute ref="testattr" use="required"/>
+ </xs:complexType>
+ </xs:element>
+
+ <xs:element name="body">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element ref="test"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+</xs:schema>
--
2.51.0