Sign Up
Log In
Log In
or
Sign Up
Places
All Projects
Status Monitor
Collapse sidebar
home:Ledest:erlang:19
erlang
3323-Implements-rest-of-missing-features-for-SA...
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File 3323-Implements-rest-of-missing-features-for-SAX-parser.patch of Package erlang
From 4d13e78579a5a4fca86d9e9e4cc5b4a127c6fa7f Mon Sep 17 00:00:00 2001 From: zadean <contact@zadean.com> Date: Thu, 12 Sep 2019 09:59:37 +0200 Subject: [PATCH 3/5] Implements rest of missing features for SAX parser [features] - External entities implemented, both as markup and replacement text - Cyclic references are checked at the end of the DTD and when found in external entities - Default attributes from DTD reported in events - Attribute whitespace normalization based on type - External entities can have a different encoding than the base document - Conditional sections (IGNORE/INCLUDE) handling added - Element content-model well-formed checks [possibly breaking change] - adds attribute_values field to xmerl_sax_parser_state record [bug fixes] - adds missing function clauses matching on empty string and individual characters - XML Version WFC - corrected handling of character references for '<' and '&' - not allowing fragments in System Literals - added '?' to allowed pubid characters - report errors in initial encoding check with `fatal_error` macro [cosmetic] - rename "litteral" to "literal" --- lib/xmerl/src/xmerl_sax_parser.erl | 5 +- lib/xmerl/src/xmerl_sax_parser.hrl | 3 +- lib/xmerl/src/xmerl_sax_parser_base.erlsrc | 1758 ++++++++++++++++++++++------ 3 files changed, 1412 insertions(+), 354 deletions(-) diff --git a/lib/xmerl/src/xmerl_sax_parser.erl b/lib/xmerl/src/xmerl_sax_parser.erl index 2767d02552..ab9dbd1ea1 100644 --- a/lib/xmerl/src/xmerl_sax_parser.erl +++ b/lib/xmerl/src/xmerl_sax_parser.erl @@ -312,7 +312,10 @@ convert_encoding(Enc, State) -> %% Just for 7,8 bit + utf8 %% Description: Parsing the xml declaration from the input stream. %%---------------------------------------------------------------------- parse_xml_directive(<<C, Rest/binary>>, State) when ?is_whitespace(C) -> - parse_xml_directive_1(Rest, [], State). + parse_xml_directive_1(Rest, [], State); +parse_xml_directive(_, State) -> + ?fatal_error(State, "Expected whitespace in directive"). + %%---------------------------------------------------------------------- %% Function: parse_xml_directive_1(Xml, Acc) -> [{Name, Value}] diff --git a/lib/xmerl/src/xmerl_sax_parser.hrl b/lib/xmerl/src/xmerl_sax_parser.hrl index 56a3a42e5f..564e7fd541 100644 --- a/lib/xmerl/src/xmerl_sax_parser.hrl +++ b/lib/xmerl/src/xmerl_sax_parser.hrl @@ -88,7 +88,8 @@ current_location, % Location of the currently parsed XML entity entity, % Parsed XML entity skip_external_dtd = false,% If true the external DTD is skipped during parsing - input_type % Source type: file | stream + input_type, % Source type: file | stream + attribute_values = [] % default attribute values }). diff --git a/lib/xmerl/src/xmerl_sax_parser_base.erlsrc b/lib/xmerl/src/xmerl_sax_parser_base.erlsrc index d7128dd4d0..a1305902d1 100644 --- a/lib/xmerl/src/xmerl_sax_parser_base.erlsrc +++ b/lib/xmerl/src/xmerl_sax_parser_base.erlsrc @@ -113,7 +113,7 @@ parse_dtd(Xml, State) -> try State1 = event_callback(startDocument, State), - Result = parse_external_entity_1(Xml, State1#xmerl_sax_parser_state{ref_table=RefTable}), + Result = parse_external_entity_1(Xml, State1#xmerl_sax_parser_state{ref_table=RefTable}, []), handle_end_document(Result) catch throw:Exception -> @@ -262,6 +262,8 @@ parse_xml_decl(?STRING_REST("<?xml", Rest1), State) -> parse_xml_decl_rest(Rest1, State); ?PARSE_XML_DECL(Bytes, State). +parse_xml_decl_rest(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_xml_decl_rest/2); parse_xml_decl_rest(?STRING_UNBOUND_REST(C, Rest) = Bytes, State) -> if ?is_whitespace(C) -> @@ -294,46 +296,72 @@ parse_text_decl(?STRING("<?xm") = Bytes, State) -> parse_text_decl(?STRING("<?xml") = Bytes, State) -> cf(Bytes, State, fun parse_text_decl/2); parse_text_decl(?STRING_REST("<?xml", Rest1), State) -> - parse_text_decl_rest(Rest1, State); + parse_text_decl_1(Rest1, State); parse_text_decl(Bytes, State) when is_binary(Bytes) -> {Bytes, State}. -parse_text_decl_rest(?STRING_EMPTY, State) -> - cf(?STRING_EMPTY, State, fun parse_text_decl_rest/2); -parse_text_decl_rest(?STRING("?") = Rest, State) -> - cf(Rest, State, fun parse_text_decl_rest/2); -parse_text_decl_rest(?STRING("v") = Rest, State) -> - cf(Rest, State, fun parse_text_decl_rest/2); -parse_text_decl_rest(?STRING("e") = Rest, State) -> - cf(Rest, State, fun parse_text_decl_rest/2); -parse_text_decl_rest(?STRING_UNBOUND_REST("?>", Rest) = _Bytes, State) -> - {Rest, State}; -parse_text_decl_rest(?STRING_UNBOUND_REST(C, _) = Rest, State) when ?is_whitespace(C) -> +parse_text_decl_1(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_text_decl_1/2); +parse_text_decl_1(?STRING("?") = Rest, State) -> + cf(Rest, State, fun parse_text_decl_1/2); +parse_text_decl_1(?STRING("v") = Rest, State) -> + cf(Rest, State, fun parse_text_decl_1/2); +parse_text_decl_1(?STRING("e") = Rest, State) -> + cf(Rest, State, fun parse_text_decl_2/2); +parse_text_decl_1(?STRING_UNBOUND_REST("?>", _Rest) = _Bytes, State) -> + ?fatal_error(State, "expecting attribute encoding"); +parse_text_decl_1(?STRING_UNBOUND_REST(C, _) = Rest, State) when ?is_whitespace(C) -> {_WS, Rest1, State1} = whitespace(Rest, State, []), - parse_text_decl_rest(Rest1, State1); -parse_text_decl_rest(?STRING_UNBOUND_REST("v", Rest) = _Bytes, State) -> + parse_text_decl_1(Rest1, State1); +parse_text_decl_1(?STRING_UNBOUND_REST("v", Rest) = _Bytes, State) -> case parse_name(Rest, State, [$v]) of {"version", Rest1, State1} -> {Rest2, State2} = parse_eq(Rest1, State1), {_Version, Rest3, State3} = parse_att_value(Rest2, State2), - parse_text_decl_rest(Rest3, State3); + parse_text_decl_2(Rest3, State3); {_, _, State1} -> ?fatal_error(State1, "expecting attribute version") end; -parse_text_decl_rest(?STRING_UNBOUND_REST("e", Rest) = _Bytes, State) -> +parse_text_decl_1(?STRING_UNBOUND_REST("e", _) = Bytes, State) -> + parse_text_decl_2(Bytes, State); +parse_text_decl_1(?STRING_UNBOUND_REST(_, _), State) -> + ?fatal_error(State, "expecting attribute encoding or version"); +parse_text_decl_1(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_text_decl_1/2], + "expecting attribute encoding or version"). + +parse_text_decl_2(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_text_decl_2/2); +parse_text_decl_2(?STRING("e") = Rest, State) -> + cf(Rest, State, fun parse_text_decl_2/2); +parse_text_decl_2(?STRING_UNBOUND_REST(C, _) = Rest, State) when ?is_whitespace(C) -> + {_WS, Rest1, State1} = whitespace(Rest, State, []), + parse_text_decl_2(Rest1, State1); +parse_text_decl_2(?STRING_UNBOUND_REST("e", Rest) = _Bytes, State) -> case parse_name(Rest, State, [$e]) of {"encoding", Rest1, State1} -> {Rest2, State2} = parse_eq(Rest1, State1), {_Version, Rest3, State3} = parse_att_value(Rest2, State2), - parse_text_decl_rest(Rest3, State3); + parse_text_decl_3(Rest3, State3); {_, _, State1} -> ?fatal_error(State1, "expecting attribute encoding") end; -parse_text_decl_rest(?STRING_UNBOUND_REST(_, _), State) -> - ?fatal_error(State, "expecting attribute encoding or version"); -parse_text_decl_rest(Bytes, State) -> - unicode_incomplete_check([Bytes, State, fun parse_text_decl_rest/2], - "expecting attribute encoding or version"). +parse_text_decl_2(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_text_decl_2/2], + "expecting attribute encoding"). + +parse_text_decl_3(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_text_decl_3/2); +parse_text_decl_3(?STRING("?") = Rest, State) -> + cf(Rest, State, fun parse_text_decl_3/2); +parse_text_decl_3(?STRING_UNBOUND_REST("?>", Rest) = _Bytes, State) -> + {Rest, State}; +parse_text_decl_3(?STRING_UNBOUND_REST(C, _) = Rest, State) when ?is_whitespace(C) -> + {_WS, Rest1, State1} = whitespace(Rest, State, []), + parse_text_decl_3(Rest1, State1); +parse_text_decl_3(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_text_decl_3/2], + "expecting ?>"). %%---------------------------------------------------------------------- %% Function: parse_prolog(Rest, State) -> Result @@ -381,6 +409,7 @@ parse_prolog_1(?STRING("DOCTYP") = Bytes, State) -> cf(Bytes, State, fun parse_prolog_1/2); parse_prolog_1(?STRING_REST("DOCTYPE", Rest), State) -> {Rest1, State1} = parse_doctype(Rest, State), + ok = check_ref_cycle(State1), State2 = event_callback(endDTD, State1), parse_prolog(Rest1, State2); parse_prolog_1(?STRING("-"), State) -> @@ -412,17 +441,31 @@ parse_version_info(?STRING_UNBOUND_REST(C, _) = Rest, State, Acc) when ?is_white parse_version_info(Rest1, State1, Acc); parse_version_info(?STRING_UNBOUND_REST(C,Rest), State, Acc) -> case is_name_start(C) of - true -> - case parse_name(Rest, State, [C]) of - {"version", Rest1, State1} -> - {Rest2, State2} = parse_eq(Rest1, State1), - {Version, Rest3, State3} = parse_att_value(Rest2, State2), - parse_xml_decl_rest(Rest3, State3, [{"version",Version}|Acc]); - {_, _, State1} -> - ?fatal_error(State1, "expecting attribute version") - end; - false -> - ?fatal_error(State, "expecting attribute version") + true -> + case parse_name(Rest, State, [C]) of + {"version", Rest1, State1} -> + {Rest2, State2} = parse_eq(Rest1, State1), + case parse_att_value(Rest2, State2) of + {"1." ++ SubVersion, Rest3, State3} -> + % any 1.N version is valid but will be handled as 1.0 + case lists:all(fun(D) when D >= $0, D =< $9 -> + true; + (_) -> + false + end, SubVersion) of + true -> + parse_xml_decl_rest(Rest3, State3, [{"version","1.0"}|Acc]); + false -> + ?fatal_error(State3, "unsupported version: 1." ++ SubVersion) + end; + {Version, _Rest3, State3} -> + ?fatal_error(State3, "unsupported version: " ++ Version) + end; + {_, _, State1} -> + ?fatal_error(State1, "expecting attribute version") + end; + false -> + ?fatal_error(State, "expecting attribute version") end; parse_version_info(Bytes, State, Acc) -> unicode_incomplete_check([Bytes, State, Acc, fun parse_version_info/3], @@ -485,6 +528,8 @@ parse_xml_decl_encoding(Bytes, State, Acc) -> undefined). +parse_xml_decl_encoding_1(?STRING_EMPTY, State, Acc) -> + cf(?STRING_EMPTY, State, Acc, fun parse_xml_decl_encoding_1/3); parse_xml_decl_encoding_1(?STRING_UNBOUND_REST(C, _) = Bytes, State, Acc) when ?is_whitespace(C) -> {_WS, Rest1, State1} = whitespace(Bytes, State, []), parse_xml_decl_standalone(Rest1, State1, Acc); @@ -595,6 +640,8 @@ check_if_rest_ok(_, _) -> %%---------------------------------------------------------------------- parse_pi_1(?STRING_EMPTY, State) -> cf(?STRING_EMPTY, State, fun parse_pi_1/2); +parse_pi_1(?STRING("?") = Rest, State) -> + cf(Rest, State, fun parse_pi_1/2); parse_pi_1(?STRING_UNBOUND_REST(C,_) = Rest, State) when ?is_whitespace(C) -> {_WS, Rest1, State1} = whitespace(Rest, State, []), @@ -880,7 +927,8 @@ parse_attributes(?STRING_EMPTY, State, CurrentTag) -> cf(?STRING_EMPTY, State, CurrentTag, fun parse_attributes/3); parse_attributes(?STRING("/") = Bytes, State, CurrentTag) -> cf(Bytes, State, CurrentTag, fun parse_attributes/3); -parse_attributes(?STRING_REST("/>", Rest), State, {Tag, AttList, NewNsList}) -> +parse_attributes(?STRING_REST("/>", Rest), State, CurrentTag) -> + {Tag, AttList, NewNsList} = fill_default_attributes(CurrentTag, State), CompleteNsList = NewNsList ++ State#xmerl_sax_parser_state.ns, {Uri, LocalName, QName, Attributes} = fix_ns(Tag, AttList, CompleteNsList), State1 = send_start_prefix_mapping_event(lists:reverse(NewNsList), State), @@ -889,7 +937,8 @@ parse_attributes(?STRING_REST("/>", Rest), State, {Tag, AttList, NewNsList}) -> State4 = send_end_prefix_mapping_event(NewNsList, State3), parse_content(Rest, State4, [], true); parse_attributes(?STRING_REST(">", Rest), #xmerl_sax_parser_state{end_tags=ETags, ns = OldNsList} = State, - {Tag, AttList, NewNsList}) -> + CurrentTag) -> + {Tag, AttList, NewNsList} = fill_default_attributes(CurrentTag, State), CompleteNsList = NewNsList ++ OldNsList, {Uri, LocalName, QName, Attributes} = fix_ns(Tag, AttList, CompleteNsList), State1 = send_start_prefix_mapping_event(lists:reverse(NewNsList), State), @@ -910,13 +959,13 @@ parse_attributes(?STRING_UNBOUND_REST(C, Rest), State, {Tag, AttList, NsList}) - {AttValue, Rest3, State3} = parse_att_value(Rest2, State2), case AttrName of {"xmlns", NsName} -> - parse_attributes(Rest3, State3, {Tag, AttList, [{NsName, AttValue} |NsList]}); + parse_attributes_1(Rest3, State3, {Tag, AttList, [{NsName, AttValue} |NsList]}); {"", "xmlns"} -> - parse_attributes(Rest3, State3, {Tag, AttList, [{"", AttValue} |NsList]}); + parse_attributes_1(Rest3, State3, {Tag, AttList, [{"", AttValue} |NsList]}); {_Prefix, _LocalName} -> case lists:keyfind(AttrName, 1, AttList) of false -> - parse_attributes(Rest3, State3, {Tag, [{AttrName, AttValue}|AttList], NsList}); + parse_attributes_1(Rest3, State3, {Tag, [{AttrName, AttValue}|AttList], NsList}); _ -> ElName = case Tag of @@ -933,7 +982,59 @@ parse_attributes(Bytes, State, CurrentTag) -> unicode_incomplete_check([Bytes, State, CurrentTag, fun parse_attributes/3], "expecting name, whitespace, /> or >"). - +% check that the next character is valid +parse_attributes_1(?STRING_EMPTY, State, CurrentTag) -> + cf(?STRING_EMPTY, State, CurrentTag, fun parse_attributes_1/3); +parse_attributes_1(?STRING_UNBOUND_REST("/", _) = Bytes, State, CurrentTag) -> + parse_attributes(Bytes, State, CurrentTag); +parse_attributes_1(?STRING_UNBOUND_REST(">", _) = Bytes, State, CurrentTag) -> + parse_attributes(Bytes, State, CurrentTag); +parse_attributes_1(?STRING_UNBOUND_REST(C, _) = Bytes, State, CurrentTag) when ?is_whitespace(C) -> + parse_attributes(Bytes, State, CurrentTag); +parse_attributes_1(?STRING_UNBOUND_REST(C, _), State, _) -> + ?fatal_error(State, "Expecting whitespace, /> or >, got:" ++ [C]). + +fill_default_attributes(CurrentTag, #xmerl_sax_parser_state{attribute_values = []}) -> + CurrentTag; +fill_default_attributes({Tag, AttList, NsList}, #xmerl_sax_parser_state{attribute_values = Atts}) -> + F = fun({{E, A}, {V, normalize}}, {AttList1, NsList1}) when E == Tag -> + {merge_on_key({A, V}, AttList1), NsList1}; + ({_, ignore}, Acc) -> Acc; + ({{E, A}, V}, {AttList1, NsList1}) when E == Tag, V =/= normalize -> + case A of + {"xmlns", NsName} -> + {AttList1, merge_on_key({NsName, V}, NsList1)}; + {"", "xmlns"} -> + {AttList1, merge_on_key({"", V}, NsList1)}; + {_, _} -> + {merge_on_key({A, V}, AttList1), NsList1} + end; + (_, Acc) -> Acc + end, + {AttList2, NsList2} = lists:foldl(F, {AttList, NsList}, Atts), + % attribute names for values needing normalization + Norm = [A || + {{E, A}, V} <- Atts, + E == Tag, + V == normalize orelse element(2, V) == normalize], + N = fun({A, V}) -> + case lists:member(A, Norm) of + true -> + {A, lists:reverse(normalize_whitespace(V))}; + false -> + {A, V} + end + end, + AttList3 = lists:map(N, AttList2), + {Tag, AttList3, NsList2}. + +merge_on_key({Key, Value}, List) -> + case lists:keyfind(Key, 1, List) of + false -> + [{Key, Value}|List]; + _ -> + List + end. %%---------------------------------------------------------------------- %% Function: fix_ns({Prefix, Name}, Attributes, Ns) -> Result @@ -1093,24 +1194,26 @@ parse_att_value(?STRING_REST("\t", Rest), #xmerl_sax_parser_state{line_no=N} = S parse_att_value(?STRING_REST("&", Rest), State, Stop, Acc) -> {Ref, Rest1, State1} = parse_reference(Rest, State, true), case Ref of - {character, _, CharValue} -> - parse_att_value(Rest1, State1, Stop, [CharValue | Acc]); - {internal_general, true, _, Value} -> - parse_att_value(Rest1, State1, Stop, Value ++ Acc); - {internal_general, false, _, Value} -> - {ParsedValue, [], State2} = parse_att_value(?TO_INPUT_FORMAT(Value), State1, undefined, []), - parse_att_value(Rest1, State2, Stop, ParsedValue ++ Acc); - {external_general, Name, _} -> - ?fatal_error(State1, "External parsed entity reference in attribute value: " ++ Name); - {not_found, Name} -> - case State#xmerl_sax_parser_state.skip_external_dtd of - false -> - ?fatal_error(State1, "Entity not declared: " ++ Name); %%VC: Entity Declared - true -> - parse_att_value(Rest1, State1, Stop, ";" ++ lists:reverse(Name) ++ "&" ++ Acc) - end; - {unparsed, Name, _} -> - ?fatal_error(State1, "Unparsed entity reference in attribute value: " ++ Name) + {character, _, CharValue} -> + parse_att_value(Rest1, State1, Stop, [CharValue | Acc]); + {internal_general, true, _, [Stop]} -> % stop char in entity + parse_att_value(Rest1, State1, Stop, [Stop|Acc]); + {internal_general, true, _, Value} -> + IValue = ?TO_INPUT_FORMAT(Value), + parse_att_value(?APPEND_STRING(IValue, Rest1), State1, Stop, Acc); + {internal_general, _, _, Value} -> + IValue = ?TO_INPUT_FORMAT(Value), + {Ctx, State2} = strip_context(State1), + {Acc1, _, State3} = parse_entity_content(IValue, State2, Acc, normalize), + parse_att_value(Rest1, add_context_back(Ctx, State3), Stop, Acc1); + {external_general, Name, _} -> + ?fatal_error(State1, "External parsed entity reference in attribute value: " ++ Name); + {not_found, Name} when State#xmerl_sax_parser_state.file_type =:= normal -> + ?fatal_error(State1, "Undeclared reference: " ++ Name); + {not_found, Name} -> + parse_att_value(Rest1, State1, Stop, ";" ++ lists:reverse(Name) ++ "&" ++ Acc); + {unparsed, Name, _} -> + ?fatal_error(State1, "Unparsed entity reference in attribute value: " ++ Name) end; parse_att_value(?STRING_UNBOUND_REST(Stop, Rest), State, Stop, Acc) -> {lists:reverse(Acc), Rest, State}; @@ -1168,6 +1271,8 @@ parse_etag(Bytes, State) -> unicode_incomplete_check([Bytes, State, fun parse_etag/2], undefined). +parse_etag_1(?STRING_EMPTY, State, Tag) -> + cf(?STRING_EMPTY, State, Tag, fun parse_etag_1/3); parse_etag_1(?STRING_REST(">", Rest), #xmerl_sax_parser_state{end_tags=[{_ETag, Uri, LocalName, QName, OldNsList, NewNsList} |RestOfETags], @@ -1200,38 +1305,37 @@ parse_etag_1(Bytes, State, Tag) -> %% Description: Parsing the content part of tags %% [43] content ::= (element | CharData | Reference | CDSect | PI | Comment)* %%---------------------------------------------------------------------- -parse_content(?STRING_EMPTY, State, Acc, IgnorableWS) -> - case check_if_document_complete(State, "No more bytes") of - true -> - State1 = send_character_event(length(Acc), IgnorableWS, lists:reverse(Acc), State), - {?STRING_EMPTY, State1}; - false -> - case catch cf(?STRING_EMPTY, State, Acc, IgnorableWS, fun parse_content/4) of - {Rest, State1} when is_record(State1, xmerl_sax_parser_state) -> - {Rest, State1}; - {fatal_error, {State1, Msg}} -> - case check_if_document_complete(State1, Msg) of - true -> - State2 = send_character_event(length(Acc), IgnorableWS, lists:reverse(Acc), State1), - {?STRING_EMPTY, State2}; - false -> - ?fatal_error(State1, Msg) - end; - Other -> - throw(Other) - end +parse_content(?STRING_EMPTY, #xmerl_sax_parser_state{end_tags = ET} = State, Acc, IgnorableWS) -> + case catch cf(?STRING_EMPTY, State, Acc, IgnorableWS, fun parse_content/4) of + {fatal_error, {State1, "No more bytes"}} when ET == [] -> + State2 = send_character_event(length(Acc), IgnorableWS, lists:reverse(Acc), State1), + {?STRING_EMPTY, State2}; + {fatal_error, {State1, "Continuation function undefined"}} when ET == [] -> + State2 = send_character_event(length(Acc), IgnorableWS, lists:reverse(Acc), State1), + {?STRING_EMPTY, State2}; + {fatal_error, {State1, Msg}} -> + ?fatal_error(State1, Msg); + {Rest, State1} when is_record(State1, xmerl_sax_parser_state) -> + {Rest, State1}; + Other -> + throw(Other) end; parse_content(?STRING("\r") = Bytes, State, Acc, IgnorableWS) -> cf(Bytes, State, Acc, IgnorableWS, fun parse_content/4); parse_content(?STRING("<") = Bytes, State, Acc, IgnorableWS) -> cf(Bytes, State, Acc, IgnorableWS, fun parse_content/4); -parse_content(?STRING_REST("</", Rest), State, Acc, IgnorableWS) -> - State1 = send_character_event(length(Acc), IgnorableWS, lists:reverse(Acc), State), - parse_etag(Rest, State1); -parse_content(?STRING("<!") = Bytes, State, _Acc, IgnorableWS) -> - cf(Bytes, State, [], IgnorableWS, fun parse_content/4); -parse_content(?STRING("<!-") = Bytes, State, _Acc, IgnorableWS) -> - cf(Bytes, State, [], IgnorableWS, fun parse_content/4); +parse_content(?STRING_REST("</", Rest), #xmerl_sax_parser_state{end_tags = ET} = State, Acc, IgnorableWS) -> + case ET of + [] -> + ?fatal_error(State, "Unbalanced tags"); + _ -> + State1 = send_character_event(length(Acc), IgnorableWS, lists:reverse(Acc), State), + parse_etag(Rest, State1) + end; +parse_content(?STRING("<!") = Bytes, State, Acc, IgnorableWS) -> + cf(Bytes, State, Acc, IgnorableWS, fun parse_content/4); +parse_content(?STRING("<!-") = Bytes, State, Acc, IgnorableWS) -> + cf(Bytes, State, Acc, IgnorableWS, fun parse_content/4); parse_content(?STRING_REST("<!--", Rest), State, Acc, IgnorableWS) -> State1 = send_character_event(length(Acc), IgnorableWS, lists:reverse(Acc), State), {Rest1, State2} = parse_comment(Rest, State1, []), @@ -1247,19 +1351,21 @@ parse_content(?STRING_REST("<?", Rest), State, Acc, IgnorableWS) -> end; parse_content(?STRING_REST("<!", Rest1) = Rest, #xmerl_sax_parser_state{end_tags = ET} = State, Acc, IgnorableWS) -> case ET of - [] -> - {Rest, State}; %% Skicka ignorable WS ??? - _ -> - State1 = send_character_event(length(Acc), IgnorableWS, lists:reverse(Acc), State), - parse_cdata(Rest1, State1) + [] -> + IValue = ?TO_INPUT_FORMAT(lists:reverse(Acc)), + {?APPEND_STRING(IValue, Rest), State}; + _ -> + State1 = send_character_event(length(Acc), IgnorableWS, lists:reverse(Acc), State), + parse_cdata(Rest1, State1) end; parse_content(?STRING_REST("<", Rest1) = Rest, #xmerl_sax_parser_state{end_tags = ET} = State, Acc, IgnorableWS) -> case ET of - [] -> - {Rest, State}; %% Skicka ignorable WS ??? - _ -> - State1 = send_character_event(length(Acc), IgnorableWS, lists:reverse(Acc), State), - parse_stag(Rest1, State1) + [] -> + IValue = ?TO_INPUT_FORMAT(lists:reverse(Acc)), + {?APPEND_STRING(IValue, Rest), State}; + _ -> + State1 = send_character_event(length(Acc), IgnorableWS, lists:reverse(Acc), State), + parse_stag(Rest1, State1) end; parse_content(?STRING_REST("\n", Rest), State, Acc, IgnorableWS) -> N = State#xmerl_sax_parser_state.line_no, @@ -1274,34 +1380,50 @@ parse_content(?STRING_REST(" ", Rest), State, Acc, IgnorableWS) -> parse_content(Rest, State,[?space |Acc], IgnorableWS); parse_content(?STRING_REST("\t", Rest), State, Acc, IgnorableWS) -> parse_content(Rest, State,[?tab |Acc], IgnorableWS); +parse_content(?STRING("]") = Bytes, State, Acc, IgnorableWS) -> + cf(Bytes, State, Acc, IgnorableWS, fun parse_content/4); +parse_content(?STRING("]]") = Bytes, State, Acc, IgnorableWS) -> + cf(Bytes, State, Acc, IgnorableWS, fun parse_content/4); parse_content(?STRING_REST("]]>", _Rest), State, _Acc, _IgnorableWS) -> ?fatal_error(State, "\"]]>\" is not allowed in content"); parse_content(?STRING_UNBOUND_REST(_C, _) = Rest, #xmerl_sax_parser_state{end_tags = []} = State, - _Acc, _IgnorableWS) -> - {Rest, State}; -parse_content(?STRING_REST("&", Rest), State, Acc, _IgnorableWS) -> + Acc, _IgnorableWS) -> + IValue = ?TO_INPUT_FORMAT(lists:reverse(Acc)), + {?APPEND_STRING(IValue, Rest), State}; +parse_content(?STRING_REST("&", Rest), #xmerl_sax_parser_state{file_type = Type} = State, Acc, IgnorableWS) -> {Ref, Rest1, State1} = parse_reference(Rest, State, true), case Ref of - {character, _, CharValue} -> - parse_content(Rest1, State1, [CharValue | Acc], false); - {internal_general, true, _, Value} -> - parse_content(Rest1, State1, Value ++ Acc, false); - {internal_general, false, _, Value} -> - IValue = ?TO_INPUT_FORMAT(Value), - parse_content(?APPEND_STRING(IValue, Rest1), State1, Acc, false); - {external_general, _, {PubId, SysId}} -> - State2 = parse_external_entity(State1, PubId, SysId), - parse_content(Rest1, State2, Acc, false); - {not_found, Name} -> - case State#xmerl_sax_parser_state.skip_external_dtd of - false -> - ?fatal_error(State1, "Entity not declared: " ++ Name); %%VC: Entity Declared - true -> - parse_content(Rest1, State1, ";" ++ lists:reverse(Name) ++ "&" ++ Acc, false) - end; - {unparsed, Name, _} -> - ?fatal_error(State1, "Unparsed entity reference in content: " ++ Name) + {character, _, CharValue} -> + parse_content(Rest1, State1, [CharValue | Acc], false); + % & causes problems with references + {internal_general, true, _, "&"} -> + ?fatal_error(State1, "Reference must begin and end in same entity"); + {internal_general, true, _, Value} -> + IValue = ?TO_INPUT_FORMAT(Value), + parse_content(?APPEND_STRING(IValue, Rest1), State1, Acc, false); + {internal_general, _, _, Value} -> + IValue = ?TO_INPUT_FORMAT(Value), + {Ctx, State2} = strip_context(State1), + % markup must be self contained + case parse_entity_content(IValue, State2, Acc, IgnorableWS) of + {fatal_error, {State3, Message}} -> + ?fatal_error(State3, Message); + {Acc1, _, State3} -> + parse_content(Rest1, add_context_back(Ctx, State3), Acc1, false) + end; + {external_general, _, {PubId, SysId}} -> + {Acc1, State2} = parse_external_entity(State1#xmerl_sax_parser_state{file_type = entity}, PubId, SysId, Acc), + parse_content(Rest1, State2#xmerl_sax_parser_state{file_type = Type}, Acc1, false); + {not_found, Name} -> + case State#xmerl_sax_parser_state.skip_external_dtd of + false -> + ?fatal_error(State1, "Entity not declared: " ++ Name); %%VC: Entity Declared + true -> + parse_content(Rest1, State1, ";" ++ lists:reverse(Name) ++ "&" ++ Acc, false) + end; + {unparsed, Name, _} -> + ?fatal_error(State1, "Unparsed entity reference in content: " ++ Name) end; parse_content(?STRING_UNBOUND_REST(C, Rest), State, Acc, _IgnorableWS) -> if @@ -1319,67 +1441,141 @@ parse_content(Bytes, State, Acc, IgnorableWS) -> %% Parameters: Rest = string() | binary() %% State = #xmerl_sax_parser_state{} %% Acc = string() -%% IgnorableWS = true | false -%% Result : {Rest, State} +%% IgnorableWS = true | false | normalize +%% Result : {Acc, Rest, State} %% Description: Parsing the content part of an external entity %% [43] content ::= (element | CharData | Reference | CDSect | PI | Comment)* %%---------------------------------------------------------------------- +parse_entity_content(Bytes, #xmerl_sax_parser_state{file_type = text} = State, Acc, _IgnorableWS) -> + parse_entity_content_1(Bytes, State, Acc); parse_entity_content(?STRING_EMPTY, State, Acc, IgnorableWS) -> - parse_content(?STRING_EMPTY, State, Acc, IgnorableWS); -parse_entity_content(?STRING_REST("<", Rest1), State, Acc, IgnorableWS) -> + case catch cf(?STRING_EMPTY, State, Acc, IgnorableWS, fun parse_entity_content/4) of + {Acc1, Rest, State1} when is_record(State1, xmerl_sax_parser_state) -> + {Acc1, Rest, State1}; + {fatal_error, {State1, "No more bytes"}} -> + {Acc, ?STRING_EMPTY, State1}; + {fatal_error, {State1, "Continuation function undefined"}} -> + {Acc, ?STRING_EMPTY, State1}; + {fatal_error, {State1, Message}} -> + ?fatal_error(State1, Message) + end; +parse_entity_content(?STRING("<") = Bytes, State, Acc, IgnorableWS) -> + cf(Bytes, State, Acc, IgnorableWS, fun parse_entity_content/4); +parse_entity_content(?STRING("<!") = Bytes, State, Acc, IgnorableWS) -> + cf(Bytes, State, Acc, IgnorableWS, fun parse_entity_content/4); +parse_entity_content(?STRING("<!-") = Bytes, State, Acc, IgnorableWS) -> + cf(Bytes, State, Acc, IgnorableWS, fun parse_entity_content/4); +parse_entity_content(?STRING_REST("<!--", Rest), State, Acc, IgnorableWS) -> + State1 = send_character_event(length(Acc), IgnorableWS, lists:reverse(Acc), State), + case catch parse_comment(Rest, State1, Acc) of + {Rest1, State2} when is_record(State2, xmerl_sax_parser_state) -> + parse_entity_content(Rest1, State2, [], true); + {fatal_error, {State2, "No more bytes"}} -> + ?fatal_error(State2, "Expected end comment"); + {fatal_error, {State2, Message}} -> + ?fatal_error(State2, Message) + end; +parse_entity_content(?STRING_REST("<?", Rest), State, Acc, IgnorableWS) -> State1 = send_character_event(length(Acc), IgnorableWS, lists:reverse(Acc), State), - case parse_stag(Rest1, State1) of + case parse_pi(Rest, State1) of + {Rest1, State2} -> + parse_entity_content(Rest1, State2, [], true); + {endDocument, _Rest1, State2} -> + IValue = ?TO_INPUT_FORMAT("<?"), + {[],?APPEND_STRING(IValue, Rest), State2} + end; +parse_entity_content(?STRING_REST("</", _), #xmerl_sax_parser_state{end_tags = []} = State, _, _)-> + ?fatal_error(State, "Unbalanced tags"); +parse_entity_content(?STRING_REST("</", Rest1), State, Acc, IgnorableWS) -> + State1 = send_character_event(length(Acc), IgnorableWS, lists:reverse(Acc), State), + case parse_etag(Rest1, State1) of {?STRING_EMPTY, State2} -> - {?STRING_EMPTY, State2}; + {[], ?STRING_EMPTY, State2}; + {Rest2, State2} when is_record(State2, xmerl_sax_parser_state) -> + parse_entity_content(Rest2, State2, [], true); + {fatal_error, {State2, Message}} -> + ?fatal_error(State2, Message) + end; +parse_entity_content(?STRING_REST("<!", Rest1), State, Acc, IgnorableWS) -> + State1 = send_character_event(length(Acc), IgnorableWS, lists:reverse(Acc), State), + case parse_cdata(Rest1, State1) of + {?STRING_EMPTY, State2} -> + {[], ?STRING_EMPTY, State2}; {Rest2, State2} when is_record(State2, xmerl_sax_parser_state) -> parse_entity_content(Rest2, State2, [], true); Other -> Other end; +parse_entity_content(?STRING_REST("<", Rest1), State, Acc, IgnorableWS) -> + State1 = send_character_event(length(Acc), IgnorableWS, lists:reverse(Acc), State), + case catch parse_stag(Rest1, State1) of + {Rest2, State2} when is_record(State2, xmerl_sax_parser_state) -> + parse_entity_content(Rest2, State2, [], true); + {fatal_error, {State2, Message}} -> + ?fatal_error(State2, Message) + end; parse_entity_content(?STRING_REST("\n", Rest), State, Acc, IgnorableWS) -> N = State#xmerl_sax_parser_state.line_no, - parse_entity_content(Rest, State#xmerl_sax_parser_state{line_no=N+1},[?lf |Acc], IgnorableWS); -parse_entity_content(?STRING_REST("\r\n", Rest), State, Acc, IgnorableWS) -> + case IgnorableWS of + normalize -> + parse_entity_content(Rest, State#xmerl_sax_parser_state{line_no=N+1},[?space |Acc], IgnorableWS); + _ -> + parse_entity_content(Rest, State#xmerl_sax_parser_state{line_no=N+1},[?lf |Acc], IgnorableWS) + end; +parse_entity_content(?STRING_REST("\r\n", Rest), #xmerl_sax_parser_state{file_type = entity} = State, Acc, IgnorableWS) -> N = State#xmerl_sax_parser_state.line_no, - parse_entity_content(Rest, State#xmerl_sax_parser_state{line_no=N+1},[?lf |Acc], IgnorableWS); + case IgnorableWS of + normalize -> + parse_entity_content(Rest, State#xmerl_sax_parser_state{line_no=N+1},[?space |Acc], IgnorableWS); + _ -> + parse_entity_content(Rest, State#xmerl_sax_parser_state{line_no=N+1},[?lf |Acc], IgnorableWS) + end; parse_entity_content(?STRING_REST("\r", Rest), State, Acc, IgnorableWS) -> N = State#xmerl_sax_parser_state.line_no, - parse_entity_content(Rest, State#xmerl_sax_parser_state{line_no=N+1},[?lf |Acc], IgnorableWS); + case IgnorableWS of + normalize -> + parse_entity_content(Rest, State#xmerl_sax_parser_state{line_no=N+1},[?space |Acc], IgnorableWS); + % only external entities are end-of-line normalized + _ when State#xmerl_sax_parser_state.file_type == normal -> + parse_entity_content(Rest, State#xmerl_sax_parser_state{line_no=N+1},[?cr |Acc], IgnorableWS); + _ -> + parse_entity_content(Rest, State#xmerl_sax_parser_state{line_no=N+1},[?lf |Acc], IgnorableWS) + end; parse_entity_content(?STRING_REST(" ", Rest), State, Acc, IgnorableWS) -> parse_entity_content(Rest, State,[?space |Acc], IgnorableWS); parse_entity_content(?STRING_REST("\t", Rest), State, Acc, IgnorableWS) -> parse_entity_content(Rest, State,[?tab |Acc], IgnorableWS); -parse_entity_content(?STRING_REST("&", Rest), State, Acc, _IgnorableWS) -> +parse_entity_content(?STRING_REST("&", Rest), #xmerl_sax_parser_state{file_type = Type} = State, Acc, IgnorableWS) -> {Ref, Rest1, State1} = parse_reference(Rest, State, true), + ok = check_ref_cycle(State1), case Ref of {character, _, CharValue} -> parse_entity_content(Rest1, State1, [CharValue | Acc], false); {internal_general, true, _, Value} -> - parse_entity_content(Rest1, State1, Value ++ Acc, false); + IValue = ?TO_INPUT_FORMAT(Value), + parse_entity_content(?APPEND_STRING(IValue, Rest1), State1, Acc, false); {internal_general, false, _, Value} -> IValue = ?TO_INPUT_FORMAT(Value), - parse_entity_content(?APPEND_STRING(IValue, Rest1), State1, Acc, false); + ET = State1#xmerl_sax_parser_state.end_tags, + {Acc1, _, State2} = parse_entity_content(IValue, State1#xmerl_sax_parser_state{end_tags = []}, Acc, IgnorableWS), + parse_entity_content(Rest1, State2#xmerl_sax_parser_state{end_tags = ET}, Acc1, false); {external_general, _, {PubId, SysId}} -> - State2 = parse_external_entity(State1, PubId, SysId), - parse_entity_content(Rest1, State2, Acc, false); + %?fatal_error(State1, "External reference in entity: " ++ Name); + {Acc1, State2} = parse_external_entity(State1#xmerl_sax_parser_state{file_type = entity}, PubId, SysId, Acc), + parse_entity_content(Rest1, State2#xmerl_sax_parser_state{file_type = Type}, Acc1, false); {not_found, Name} -> - case State#xmerl_sax_parser_state.skip_external_dtd of - false -> - ?fatal_error(State1, "Entity not declared: " ++ Name); %%VC: Entity Declared - true -> - parse_entity_content(Rest1, State1, ";" ++ lists:reverse(Name) ++ "&" ++ Acc, false) - end; + ?fatal_error(State1, "Entity not declared: " ++ Name); {unparsed, Name, _} -> ?fatal_error(State1, "Unparsed entity reference in content: " ++ Name) end; parse_entity_content(?STRING_UNBOUND_REST(C, Rest), State, Acc, _IgnorableWS) -> if ?is_char(C) -> - case parse_content(Rest, State, [C|Acc], false) of - {?STRING_EMPTY, State1} -> - {?STRING_EMPTY, State1}; - {Rest1, State1} when is_record(State1, xmerl_sax_parser_state) -> - parse_entity_content(Rest1, State1, [], true); + case parse_entity_content(Rest, State, [C|Acc], false) of + {Acc1, ?STRING_EMPTY, State1} -> + {Acc1, ?STRING_EMPTY, State1}; + {Acc1, Rest1, State1} when is_record(State1, xmerl_sax_parser_state) -> + parse_entity_content(Rest1, State1, Acc1, true); Other -> Other end; @@ -1388,23 +1584,28 @@ parse_entity_content(?STRING_UNBOUND_REST(C, Rest), State, Acc, _IgnorableWS) -> end; parse_entity_content(Bytes, State, Acc, IgnorableWS) -> unicode_incomplete_check([Bytes, State, Acc, IgnorableWS, fun parse_entity_content/4], - undefined). - -%%---------------------------------------------------------------------- -%% Function: check_if_document_complete(State, ErrorMsg) -> Result -%% Parameters: State = #xmerl_sax_parser_state{} -%% ErrorMsg = string() -%% Result : boolean() -%% Description: Checks that the document is complete if we don't have more data.. -%%---------------------------------------------------------------------- -check_if_document_complete(#xmerl_sax_parser_state{end_tags = []}, - "No more bytes") -> - true; -check_if_document_complete(#xmerl_sax_parser_state{end_tags = []}, - "Continuation function undefined") -> - true; -check_if_document_complete(_, _) -> - false. + "Unexpected end of entity content"). + +% reads an external entity as replacement text +parse_entity_content_1(?STRING_EMPTY, State, Acc) -> + case catch cf(?STRING_EMPTY, State, Acc, fun parse_entity_content_1/3) of + {fatal_error, {State1, "No more bytes"}} -> + {Acc, ?STRING_EMPTY, State1}; + {fatal_error, {State1, Message}} -> + ?fatal_error(State1, Message); + {Acc1, ?STRING_EMPTY, State1} -> + {Acc1, ?STRING_EMPTY, State1} + end; +parse_entity_content_1(?STRING_UNBOUND_REST(C, Rest), State, Acc) -> + if + ?is_char(C) -> + parse_entity_content_1(Rest, State, [C|Acc]); + true -> + ?fatal_error(State, lists:flatten(io_lib:format("Bad character in entity: ~p", [C]))) + end; +parse_entity_content_1(Bytes, State, Acc) -> + unicode_incomplete_check([Bytes, State, Acc, fun parse_entity_content_1/3], + "Unexpected end of entity content"). %%---------------------------------------------------------------------- %% Function: send_character_event(Length, IgnorableWS, String, State) -> Result @@ -1506,6 +1707,8 @@ parse_reference(Bytes, State, HaveToExist) -> undefined). +parse_reference_1(?STRING_EMPTY, State, HaveToExist, Name) -> + cf(?STRING_EMPTY, State, HaveToExist, Name, fun parse_reference_1/4); parse_reference_1(?STRING_REST(";", Rest), State, HaveToExist, Name) -> case look_up_reference(Name, HaveToExist, State) of {internal_general, Name, RefValue} -> @@ -1537,8 +1740,12 @@ is_delimiter(34) -> true; is_delimiter("&") -> true; +is_delimiter("&") -> + true; is_delimiter("<") -> true; +is_delimiter("<") -> + true; is_delimiter(">") -> true; is_delimiter("'") -> @@ -1572,6 +1779,8 @@ parse_pe_reference(Bytes, State) -> undefined). +parse_pe_reference_1(?STRING_EMPTY, State, Name) -> + cf(?STRING_EMPTY, State, Name, fun parse_pe_reference_1/3); parse_pe_reference_1(?STRING_REST(";", Rest), State, Name) -> Name1 = "%" ++ Name, Result = look_up_reference(Name1, true, State), @@ -1600,15 +1809,15 @@ insert_reference(Name, Value, #xmerl_sax_parser_state{ref_table = Map} = State) %%---------------------------------------------------------------------- -%% Function: look_up_reference(Reference, State) -> Result +%% Function: look_up_reference(Reference, HaveToExist, State) -> Result %% Parameters: Reference = string() %% State = #xmerl_sax_parser_state{} %% Result : %%---------------------------------------------------------------------- look_up_reference("amp", _, _) -> - {internal_general, "amp", "&"}; + {internal_general, "amp", "&"}; look_up_reference("lt", _, _) -> - {internal_general, "lt", "<"}; + {internal_general, "lt", "<"}; look_up_reference("gt", _, _) -> {internal_general, "gt", ">"}; look_up_reference("apos", _, _) -> @@ -1683,7 +1892,7 @@ parse_digit(Bytes, State, Acc) -> undefined). %%---------------------------------------------------------------------- -%% Function: parse_system_litteral(Rest, State, Stop, Acc) -> Result +%% Function: parse_system_literal(Rest, State, Stop, Acc) -> Result %% Parameters: Rest = string() | binary() %% State = #xmerl_sax_parser_state{} %% Stop = $' | $" @@ -1691,21 +1900,23 @@ parse_digit(Bytes, State, Acc) -> %% Result : {Value, Reference, Rest, State} %% Value = integer() %% Reference = string() -%% Description: Parse a system litteral. +%% Description: Parse a system literal. %% [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") %%---------------------------------------------------------------------- -parse_system_litteral(?STRING_EMPTY, State, Stop, Acc) -> - cf(?STRING_EMPTY, State, Stop, Acc, fun parse_system_litteral/4); -parse_system_litteral(?STRING_UNBOUND_REST(Stop, Rest), State, Stop, Acc) -> +parse_system_literal(?STRING_EMPTY, State, Stop, Acc) -> + cf(?STRING_EMPTY, State, Stop, Acc, fun parse_system_literal/4); +parse_system_literal(?STRING_UNBOUND_REST(Stop, Rest), State, Stop, Acc) -> {lists:reverse(Acc), Rest, State}; -parse_system_litteral(?STRING_UNBOUND_REST(C, Rest), State, Stop, Acc) -> - parse_system_litteral(Rest, State, Stop, [C |Acc]); -parse_system_litteral(Bytes, State, Stop, Acc) -> - unicode_incomplete_check([Bytes, State, Stop, Acc, fun parse_system_litteral/4], +parse_system_literal(?STRING_UNBOUND_REST("#", _), State, _, _) -> + ?fatal_error(State, "Fragment found in system identifier"); +parse_system_literal(?STRING_UNBOUND_REST(C, Rest), State, Stop, Acc) -> + parse_system_literal(Rest, State, Stop, [C |Acc]); +parse_system_literal(Bytes, State, Stop, Acc) -> + unicode_incomplete_check([Bytes, State, Stop, Acc, fun parse_system_literal/4], undefined). %%---------------------------------------------------------------------- -%% Function: parse_pubid_litteral(Rest, State, Stop, Acc) -> Result +%% Function: parse_pubid_literal(Rest, State, Stop, Acc) -> Result %% Parameters: Rest = string() | binary() %% State = #xmerl_sax_parser_state{} %% Stop = $' | $" @@ -1713,24 +1924,42 @@ parse_system_litteral(Bytes, State, Stop, Acc) -> %% Result : {Value, Reference, Rest, State} %% Value = integer() %% Reference = string() -%% Description: Parse a public idlitteral. +%% Description: Parse a public idliteral. %% [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" %%---------------------------------------------------------------------- -parse_pubid_litteral(?STRING_EMPTY, State, Stop, Acc) -> - cf(?STRING_EMPTY, State, Stop, Acc, fun parse_pubid_litteral/4); -parse_pubid_litteral(?STRING_UNBOUND_REST(Stop, Rest), State, Stop, Acc) -> - {lists:reverse(Acc), Rest, State}; -parse_pubid_litteral(?STRING_UNBOUND_REST(C, Rest), State, Stop, Acc) -> +parse_pubid_literal(?STRING_EMPTY, State, Stop, Acc) -> + cf(?STRING_EMPTY, State, Stop, Acc, fun parse_pubid_literal/4); +parse_pubid_literal(?STRING_UNBOUND_REST(Stop, Rest), State, Stop, Acc) -> + {normalize_whitespace(Acc), Rest, State}; +parse_pubid_literal(?STRING_UNBOUND_REST(C, Rest), State, Stop, Acc) -> case is_pubid_char(C) of true -> - parse_pubid_litteral(Rest, State, Stop, [C |Acc]); + parse_pubid_literal(Rest, State, Stop, [C |Acc]); false -> - ?fatal_error(State, "Character not allowed in pubid litteral: " ++ [C]) + ?fatal_error(State, "Character not allowed in pubid literal: " ++ [C]) end; -parse_pubid_litteral(Bytes, State, Stop, Acc) -> - unicode_incomplete_check([Bytes, State, Stop, Acc, fun parse_pubid_litteral/4], +parse_pubid_literal(Bytes, State, Stop, Acc) -> + unicode_incomplete_check([Bytes, State, Stop, Acc, fun parse_pubid_literal/4], undefined). +% returns a reversed, normalized version of the string +normalize_whitespace(Acc) -> + T1 = delete_leading_whitespace(Acc), + T2 = normalize_whitespace(T1, []), + delete_leading_whitespace(T2). + +-define(is_ws(C), C =:= ?space orelse C =:= ?cr orelse C =:= ?lf orelse C =:= ?tab). + +normalize_whitespace([W1,W2|T], Acc) when ?is_ws(W1), + ?is_ws(W2) -> + normalize_whitespace([$ |T], Acc); +normalize_whitespace([W|T], Acc) when ?is_ws(W) -> + normalize_whitespace(T, [$ |Acc]); +normalize_whitespace([W|T], Acc) -> + normalize_whitespace(T, [W|Acc]); +normalize_whitespace([], Acc) -> + Acc. + %%====================================================================== %% DTD Parsing %%====================================================================== @@ -1829,19 +2058,23 @@ parse_doctype_1(?STRING_UNBOUND_REST(C, _) = Rest, State, Name, Definition) when parse_doctype_1(?STRING_UNBOUND_REST(C, _) = Rest, State, Name, _Definition) when C == $S; C == $P -> {PubId, SysId, Rest1, State1} = parse_external_id(Rest, State, false), State2 = event_callback({startDTD, Name, PubId, SysId}, State1), - State3 = - case State2#xmerl_sax_parser_state.skip_external_dtd of - false -> - parse_external_entity(State2#xmerl_sax_parser_state{file_type=dtd}, PubId, SysId); - true -> - State2 - end, - parse_doctype_1(Rest1, State3, Name, true); + {Rest2, State3} = parse_doctype_1(Rest1, State2, Name, true), + % external subsets are parsed after internal + case State2#xmerl_sax_parser_state.skip_external_dtd of + false -> + FT = State3#xmerl_sax_parser_state.file_type, + {_, State4} = parse_external_entity(State3#xmerl_sax_parser_state{file_type=dtd}, PubId, SysId, []), + {Rest2, State4#xmerl_sax_parser_state{file_type = FT}}; + true -> + {Rest2, State3} + end; parse_doctype_1(Bytes, State, Name, Definition) -> unicode_incomplete_check([Bytes, State, Name, Definition, fun parse_doctype_1/4], "expecting >, external id or declaration part"). +parse_doctype_2(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_doctype_2/2); parse_doctype_2(?STRING_REST(">", Rest), State) -> {Rest, State}; parse_doctype_2(Bytes, State) -> @@ -1850,62 +2083,70 @@ parse_doctype_2(Bytes, State) -> %%---------------------------------------------------------------------- -%% Function : parse_external_entity(State, PubId, SysId) -> Result +%% Function : parse_external_entity(State, PubId, SysId, Acc) -> Result %% Parameters: State = #xmerl_sax_parser_state{} %% PubId = string() %% SysId = string() -%% Result : {Rest, State} +%% Result : {Acc, State} %% Description: Starts the parsing of an external entity by calling the resolver and %% then sends the input to the parsing function. %%---------------------------------------------------------------------- %% The public id is not handled -parse_external_entity(State, _PubId, SysId) -> +parse_external_entity(State, _PubId, SysId, Acc) -> ExtRef = check_uri(SysId, State#xmerl_sax_parser_state.current_location), SaveState = event_callback({startEntity, SysId}, State), State1 = State#xmerl_sax_parser_state{line_no=1, - continuation_state=undefined, - continuation_fun=fun xmerl_sax_parser:default_continuation_cb/1, - end_tags = []}, + end_tags = []}, - - {EventState, RefTable} = handle_external_entity(ExtRef, State1), + {Acc1, EventState, EventRefTab, AttVals} = handle_external_entity(ExtRef, State1, Acc), NewState = event_callback({endEntity, SysId}, - SaveState#xmerl_sax_parser_state{event_state=EventState, - ref_table=RefTable}), - NewState#xmerl_sax_parser_state{file_type=normal}. + SaveState#xmerl_sax_parser_state{event_state=EventState}), + case SaveState#xmerl_sax_parser_state.standalone of + no -> + {Acc1, NewState#xmerl_sax_parser_state{ref_table = EventRefTab, + attribute_values = AttVals}}; + yes -> + {Acc1, NewState#xmerl_sax_parser_state{attribute_values = AttVals}} + end. %%---------------------------------------------------------------------- -%% Function : handle_external_entity(ExtRef, State) -> Result +%% Function : handle_external_entity(ExtRef, State, Acc) -> Result %% Parameters: ExtRef = {file, string()} | {http, string()} %% State = #xmerl_sax_parser_state{} -%% Result : string() | binary() +%% Result : {Acc, State} %% Description: Returns working directory, entity and the opened %% filedescriptor. %%---------------------------------------------------------------------- -handle_external_entity({file, FileToOpen}, State) -> +handle_external_entity({file, FileToOpen}, #xmerl_sax_parser_state{encoding = Enc} = State, Acc) -> case file:open(FileToOpen, [raw, read, binary]) of {error, Reason} -> ?fatal_error(State, "Couldn't open external entity "++ FileToOpen ++ " : " ++ file:format_error(Reason)); {ok, FD} -> - {?STRING_EMPTY, EntityState} = - parse_external_entity_1(<<>>, - State#xmerl_sax_parser_state{continuation_state=FD, - current_location=filename:dirname(FileToOpen), - entity=filename:basename(FileToOpen), - input_type=file}), - ok = file:close(FD), - {EntityState#xmerl_sax_parser_state.event_state, - EntityState#xmerl_sax_parser_state.ref_table} - end; -handle_external_entity({http, Url}, State) -> + State1 = State#xmerl_sax_parser_state{continuation_state={FD, <<>>}, + continuation_fun = fun external_continuation_cb/1, + current_location=filename:dirname(FileToOpen), + entity=filename:basename(FileToOpen), + input_type=file}, + {Head, #xmerl_sax_parser_state{encoding = Enc1} = State2} = detect_charset(State1), + {Head1, State3} = encode_external_input(Head, Enc1, Enc, State2), + ConFun = external_continuation_cb(Enc1, Enc), + {Acc1, ?STRING_EMPTY, EntityState} = + parse_external_entity_1(Head1, State3#xmerl_sax_parser_state{continuation_fun = ConFun}, Acc), + ok = file:close(FD), + {Acc1, + EntityState#xmerl_sax_parser_state.event_state, + EntityState#xmerl_sax_parser_state.ref_table, + EntityState#xmerl_sax_parser_state.attribute_values} + end; +handle_external_entity({http, Url}, #xmerl_sax_parser_state{encoding = Enc} = State, Acc) -> try {Host, Port, Key} = http(Url), @@ -1915,92 +2156,110 @@ handle_external_entity({http, Url}, State) -> ?fatal_error(State, "Couldn't open temporary file " ++ TmpFile ++ " : " ++ file:format_error(Reason)); {ok, FD} -> - {?STRING_EMPTY, EntityState} = - parse_external_entity_byte_order_mark(<<>>, - State#xmerl_sax_parser_state{continuation_state=FD, - current_location=filename:dirname(Url), - entity=filename:basename(Url), - input_type=file}), - ok = file:close(FD), - ok = file:delete(TmpFile), - {EntityState#xmerl_sax_parser_state.event_state, - EntityState#xmerl_sax_parser_state.ref_table} - + State1 = State#xmerl_sax_parser_state{continuation_state={FD, <<>>}, + continuation_fun = fun external_continuation_cb/1, + current_location=filename:dirname(Url), + entity=filename:basename(Url), + input_type=file}, + {Head, #xmerl_sax_parser_state{encoding = Enc1} = State2} = detect_charset(State1), + ConFun = external_continuation_cb(Enc1, Enc), + {Acc1, ?STRING_EMPTY, EntityState} = + parse_external_entity_1(Head, State2#xmerl_sax_parser_state{continuation_fun = ConFun}, Acc), + ok = file:close(FD), + ok = file:delete(TmpFile), + {Acc1, + EntityState#xmerl_sax_parser_state.event_state, + EntityState#xmerl_sax_parser_state.ref_table, + EntityState#xmerl_sax_parser_state.attribute_values} end catch throw:{error, Error} -> ?fatal_error(State, Error) end; -handle_external_entity({Tag, _Url}, State) -> +handle_external_entity({Tag, _Url}, State, _Acc) -> ?fatal_error(State, "Unsupported URI type: " ++ atom_to_list(Tag)). -?PARSE_EXTERNAL_ENTITY_BYTE_ORDER_MARK(Bytes, State). +%%?PARSE_EXTERNAL_ENTITY_BYTE_ORDER_MARK(Bytes, State). %%---------------------------------------------------------------------- -%% Function : parse_external_entity_1(Rest, State) -> Result +%% Function : parse_external_entity_1(Rest, State, Acc) -> Result %% Parameters: Rest = string() | binary() %% State = #xmerl_sax_parser_state{} -%% Result : {Rest, State} +%% Result : {Acc, Rest, State} %% Description: Parse the external entity. %%---------------------------------------------------------------------- -parse_external_entity_1(?STRING_EMPTY, State) -> - case catch cf(?STRING_EMPTY, State, fun parse_external_entity_1/2) of - {Rest, State1} when is_record(State1, xmerl_sax_parser_state) -> - {Rest, State1}; - {fatal_error, {State1, "No more bytes"}} -> - {?STRING_EMPTY, State1}; - Other -> - throw(Other) - end; -parse_external_entity_1(?STRING("<") = Bytes, State) -> - cf(Bytes, State, fun parse_external_entity_1/2); -parse_external_entity_1(?STRING("<?") = Bytes, State) -> - cf(Bytes, State, fun parse_external_entity_1/2); -parse_external_entity_1(?STRING("<?x") = Bytes, State) -> - cf(Bytes, State, fun parse_external_entity_1/2); -parse_external_entity_1(?STRING("<?xm") = Bytes, State) -> - cf(Bytes, State, fun parse_external_entity_1/2); -parse_external_entity_1(?STRING("<?xml") = Bytes, State) -> - cf(Bytes, State, fun parse_external_entity_1/2); +parse_external_entity_1(?STRING_EMPTY, State, Acc) -> + case catch cf(?STRING_EMPTY, State, Acc, fun parse_external_entity_1/3) of + {fatal_error, {State1, "No more bytes"}} -> + {Acc, ?STRING_EMPTY, State1}; + {fatal_error, {State1, Msg}} -> + ?fatal_error(State1, Msg); + {Acc1, ?STRING_EMPTY, State1} when is_record(State1, xmerl_sax_parser_state) -> + {Acc1, ?STRING_EMPTY, State1}; + {_, _, State1} when is_record(State1, xmerl_sax_parser_state) -> + ?fatal_error(State1, "Not well-formed entity") + end; +parse_external_entity_1(?STRING("<") = Bytes, State, Acc) -> + cf(Bytes, State, Acc, fun parse_external_entity_1/3); +parse_external_entity_1(?STRING("<?") = Bytes, State, Acc) -> + cf(Bytes, State, Acc, fun parse_external_entity_1/3); +parse_external_entity_1(?STRING("<?x") = Bytes, State, Acc) -> + cf(Bytes, State, Acc, fun parse_external_entity_1/3); +parse_external_entity_1(?STRING("<?xm") = Bytes, State, Acc) -> + cf(Bytes, State, Acc, fun parse_external_entity_1/3); +parse_external_entity_1(?STRING("<?xml") = Bytes, State, Acc) -> + cf(Bytes, State, Acc, fun parse_external_entity_1/3); parse_external_entity_1(?STRING_REST("<?xml", Rest) = Bytes, - #xmerl_sax_parser_state{file_type=Type} = State) -> - {Rest1, State1} = - case is_next_char_whitespace(Rest, State) of - false -> - {Bytes, State}; - true when Type =:= dtd -> - {_XmlAttributes, R, S} = parse_version_info(Rest, State, []), - %S1 = event_callback({processingInstruction, "xml", XmlAttributes}, S),% The XML decl. should not be reported as a PI - {R, S}; - true -> - parse_text_decl(Bytes, State) - end, + #xmerl_sax_parser_state{file_type=Type, + end_tags = ET} = State, Acc) -> + {Rest1, State1} = + case is_next_char_whitespace(Rest, State) of + false -> + {Bytes, State}; + true -> + parse_text_decl(Bytes, State) + end, case Type of - dtd -> - case catch parse_doctype_decl(Rest1, State1) of - {Rest2, State2} when is_record(State2, xmerl_sax_parser_state) -> - {Rest2, State2}; - {fatal_error, {State2, "No more bytes"}} -> - {?STRING_EMPTY, State2}; - Other -> - throw(Other) - end; - - _ -> % Type is normal or entity - parse_entity_content(Rest1, State1, [], true) + dtd -> + case catch parse_doctype_decl(Rest1, State1) of + {?STRING_EMPTY, State2} when is_record(State2, xmerl_sax_parser_state) -> + % this my not truly be empty. the file may have + % more unbalanced stuff, but not have been read yet + {[], ?STRING_EMPTY, State2}; + {_, State1} when is_record(State1, xmerl_sax_parser_state) -> + ?fatal_error(State1, "Not well-formed DTD"); + {fatal_error, {State2, "No more bytes"}} -> + {[], ?STRING_EMPTY, State2}; + {fatal_error, {State2, Message}} -> + ?fatal_error(State2, Message) + end; + _ -> % Type is normal or entity + {Acc1, Rest3, State3} = parse_entity_content(Rest1, State1#xmerl_sax_parser_state{end_tags = []}, Acc, true), + {Acc1, Rest3, State3#xmerl_sax_parser_state{end_tags = ET}} end; parse_external_entity_1(?STRING_UNBOUND_REST(_C, _) = Bytes, - #xmerl_sax_parser_state{file_type=Type} = State) -> + #xmerl_sax_parser_state{file_type = Type, + end_tags = ET} = State, Acc) -> case Type of - normal -> - parse_entity_content(Bytes, State, [], true); - dtd -> - parse_doctype_decl(Bytes, State); - entity -> - parse_doctype_decl(Bytes, State) end; -parse_external_entity_1(Bytes, State) -> - unicode_incomplete_check([Bytes, State, fun parse_external_entity_1/2], - undefined). + dtd -> + case catch parse_doctype_decl(Bytes, State) of + {?STRING_EMPTY, State2} when is_record(State2, xmerl_sax_parser_state) -> + {[], ?STRING_EMPTY, State2}; + {_, State1} when is_record(State1, xmerl_sax_parser_state) -> + ?fatal_error(State1, "Not well-formed DTD"); + {fatal_error, {State2, "No more bytes"}} -> + {[], ?STRING_EMPTY, State2}; + {fatal_error, {State2, Message}} -> + ?fatal_error(State2, Message) + end; + _ -> + {Acc1, Rest1, State1} = parse_entity_content(Bytes, State#xmerl_sax_parser_state{end_tags = []}, Acc, true), + {Acc1, Rest1, State1#xmerl_sax_parser_state{end_tags = ET}} + + end; +parse_external_entity_1(Bytes, State, Acc) -> + unicode_incomplete_check([Bytes, State, Acc, fun parse_external_entity_1/3], + undefined). %%---------------------------------------------------------------------- %% Function : is_next_char_whitespace(Bytes, State) -> Result @@ -2009,6 +2268,8 @@ parse_external_entity_1(Bytes, State) -> %% Result : true | false %% Description: Checks if first character is whitespace. %%---------------------------------------------------------------------- +is_next_char_whitespace(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun is_next_char_whitespace/2); is_next_char_whitespace(?STRING_UNBOUND_REST(C, _), _) when ?is_whitespace(C) -> true; is_next_char_whitespace(?STRING_UNBOUND_REST(_C, _), _) -> @@ -2073,21 +2334,25 @@ parse_external_id(Bytes, State, OptionalSystemId) -> %% Description: Parse a system id. The function is used in two cases one %% where the system is optional and one where it's required. %%---------------------------------------------------------------------- +parse_system_id(?STRING_EMPTY, State, OptionalSystemId) -> + cf(?STRING_EMPTY, State, OptionalSystemId, fun parse_system_id/3); parse_system_id(?STRING_UNBOUND_REST(C, _) = Bytes, State, OptionalSystemId) when ?is_whitespace(C) -> {_WS, Rest, State1} = whitespace(Bytes, State, []), - check_system_litteral(Rest, State1, OptionalSystemId); + check_system_literal(Rest, State1, OptionalSystemId); parse_system_id(?STRING_UNBOUND_REST(_C, _) = Bytes, State, true) -> {"", Bytes, State}; parse_system_id(Bytes, State, OptionalSystemId) -> unicode_incomplete_check([Bytes, State, OptionalSystemId, fun parse_system_id/3], "whitespace expected"). -check_system_litteral(?STRING_UNBOUND_REST(C, Rest), State, _OptionalSystemId) when C == $'; C == $" -> - parse_system_litteral(Rest, State, C, []); -check_system_litteral(?STRING_UNBOUND_REST(_C, _) = Bytes, State, true) -> +check_system_literal(?STRING_EMPTY, State, OptionalSystemId) -> + cf(?STRING_EMPTY, State, OptionalSystemId, fun check_system_literal/3); +check_system_literal(?STRING_UNBOUND_REST(C, Rest), State, _OptionalSystemId) when C == $'; C == $" -> + parse_system_literal(Rest, State, C, []); +check_system_literal(?STRING_UNBOUND_REST(_C, _) = Bytes, State, true) -> {"", Bytes, State}; -check_system_litteral(Bytes, State, OptionalSystemId) -> - unicode_incomplete_check([Bytes, State, OptionalSystemId, fun check_system_litteral/3], +check_system_literal(Bytes, State, OptionalSystemId) -> + unicode_incomplete_check([Bytes, State, OptionalSystemId, fun check_system_literal/3], "\" or \' expected"). @@ -2102,20 +2367,24 @@ check_system_litteral(Bytes, State, OptionalSystemId) -> %% Description: Parse a public id. The function is used in two cases one %% where the following system is optional and one where it's required. %%---------------------------------------------------------------------- +parse_public_id(?STRING_EMPTY, State, OptionalSystemId) -> + cf(?STRING_EMPTY, State, OptionalSystemId, fun parse_public_id/3); parse_public_id(?STRING_UNBOUND_REST(C, _) = Bytes, State, OptionalSystemId) when ?is_whitespace(C) -> {_WS, Rest, State1} = whitespace(Bytes, State, []), - check_public_litteral(Rest, State1, OptionalSystemId); + check_public_literal(Rest, State1, OptionalSystemId); parse_public_id(Bytes, State,OptionalSystemId) -> unicode_incomplete_check([Bytes, State, OptionalSystemId, fun parse_public_id/3], "whitespace expected"). -check_public_litteral(?STRING_UNBOUND_REST(C, Rest), State, OptionalSystemId) when C == $'; C == $" -> - {PubId, Rest1, State1} = parse_pubid_litteral(Rest, State, C, []), +check_public_literal(?STRING_EMPTY, State, OptionalSystemId) -> + cf(?STRING_EMPTY, State, OptionalSystemId, fun check_public_literal/3); +check_public_literal(?STRING_UNBOUND_REST(C, Rest), State, OptionalSystemId) when C == $'; C == $" -> + {PubId, Rest1, State1} = parse_pubid_literal(Rest, State, C, []), {SysId, Rest2, State2} = parse_system_id(Rest1, State1, OptionalSystemId), {PubId, SysId, Rest2, State2}; -check_public_litteral(Bytes, State, OptionalSystemId) -> - unicode_incomplete_check([Bytes, State, OptionalSystemId, fun check_public_litteral/3], +check_public_literal(Bytes, State, OptionalSystemId) -> + unicode_incomplete_check([Bytes, State, OptionalSystemId, fun check_public_literal/3], "\" or \' expected"). @@ -2132,6 +2401,8 @@ parse_doctype_decl(?STRING_EMPTY, State) -> cf(?STRING_EMPTY, State, fun parse_doctype_decl/2); parse_doctype_decl(?STRING("<"), State) -> cf(?STRING("<"), State, fun parse_doctype_decl/2); +parse_doctype_decl(?STRING("<!"), State) -> + cf(?STRING("<!"), State, fun parse_doctype_decl/2); parse_doctype_decl(?STRING_REST("<?", Rest), State) -> case parse_pi(Rest, State) of {Rest1, State1} -> @@ -2140,23 +2411,37 @@ parse_doctype_decl(?STRING_REST("<?", Rest), State) -> IValue = ?TO_INPUT_FORMAT("<?"), {?APPEND_STRING(IValue, Rest), State1} end; -parse_doctype_decl(?STRING_REST("%", Rest), State) -> +parse_doctype_decl(?STRING_REST("%", Rest), #xmerl_sax_parser_state{file_type = Type} = State) -> {Ref, Rest1, State1} = parse_pe_reference(Rest, State), case Ref of - {internal_parameter, _, RefValue} -> - IValue = ?TO_INPUT_FORMAT(" " ++ RefValue ++ " "), - parse_doctype_decl(?APPEND_STRING(IValue, Rest1), State1); - {external_parameter, _, {PubId, SysId}} -> - State2 = parse_external_entity(State1#xmerl_sax_parser_state{file_type = entity}, PubId, SysId), - parse_doctype_decl(Rest1, State2); - {not_found, Name} -> - case State#xmerl_sax_parser_state.skip_external_dtd of - false -> - ?fatal_error(State1, "Entity not declared: " ++ Name); %%WFC: Entity Declared - true -> - parse_doctype_decl(Rest1, State1) - end + {internal_parameter, _, RefValue} -> + IValue = ?TO_INPUT_FORMAT(" " ++ RefValue ++ " "), + {Ctx, State2} = strip_context(State1), + case catch parse_doctype_decl(IValue, State2) of + {fatal_error, {State3, "No more bytes"}} -> + parse_doctype_decl(Rest1, add_context_back(Ctx, State3)); + {fatal_error, {State3, "Continuation function undefined"}} -> + parse_doctype_decl(Rest1, add_context_back(Ctx, State3)); + {_, State3} when is_record(State3, xmerl_sax_parser_state) -> + parse_doctype_decl(Rest1, add_context_back(Ctx, State3)); + {fatal_error, {State3, Mess}} -> + ?fatal_error(State3, Mess) + end; + {external_parameter, _, {PubId, SysId}} -> + {_, State2} = parse_external_entity(State1#xmerl_sax_parser_state{file_type = dtd}, PubId, SysId, []), + parse_doctype_decl(Rest1, State2#xmerl_sax_parser_state{file_type = Type}); + {not_found, _Name} -> + case State#xmerl_sax_parser_state.skip_external_dtd of + false -> + parse_doctype_decl(Rest1, State1); + %?fatal_error(State1, "Entity not declared: " ++ Name); %%P69 VC: Entity Declared + true -> + parse_doctype_decl(Rest1, State1) + end end; +parse_doctype_decl(?STRING_REST("<![", Rest), State) -> + {Rest1, State1} = parse_doctype_decl_2(Rest, State), + parse_doctype_decl(Rest1, State1); parse_doctype_decl(?STRING_REST("<!", Rest1), State) -> parse_doctype_decl_1(Rest1, State); parse_doctype_decl(?STRING_REST("]", Rest), State) -> @@ -2251,6 +2536,68 @@ parse_doctype_decl_1(Bytes, State) -> unicode_incomplete_check([Bytes, State, fun parse_doctype_decl_1/2], "expecting ELEMENT, ATTLIST, ENTITY, NOTATION or comment"). +parse_doctype_decl_2(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_doctype_decl_2/2); +% conditionalSect +parse_doctype_decl_2(?STRING("I") = Bytes, State) -> + cf(Bytes, State, fun parse_doctype_decl_2/2); +parse_doctype_decl_2(?STRING("IN") = Bytes, State) -> + cf(Bytes, State, fun parse_doctype_decl_2/2); +parse_doctype_decl_2(?STRING("INC") = Bytes, State) -> + cf(Bytes, State, fun parse_doctype_decl_2/2); +parse_doctype_decl_2(?STRING("INCL") = Bytes, State) -> + cf(Bytes, State, fun parse_doctype_decl_2/2); +parse_doctype_decl_2(?STRING("INCLU") = Bytes, State) -> + cf(Bytes, State, fun parse_doctype_decl_2/2); +parse_doctype_decl_2(?STRING("INCLUD") = Bytes, State) -> + cf(Bytes, State, fun parse_doctype_decl_2/2); +parse_doctype_decl_2(?STRING_REST("INCLUDE", Rest), State) -> + case State#xmerl_sax_parser_state.file_type of + normal -> + ?fatal_error(State, "Conditional sections may only appear in the external DTD subset."); + _ -> + parse_include_sect(Rest, State) + end; +parse_doctype_decl_2(?STRING("IG") = Bytes, State) -> + cf(Bytes, State, fun parse_doctype_decl_2/2); +parse_doctype_decl_2(?STRING("IGN") = Bytes, State) -> + cf(Bytes, State, fun parse_doctype_decl_2/2); +parse_doctype_decl_2(?STRING("IGNO") = Bytes, State) -> + cf(Bytes, State, fun parse_doctype_decl_2/2); +parse_doctype_decl_2(?STRING("IGNOR") = Bytes, State) -> + cf(Bytes, State, fun parse_doctype_decl_2/2); +parse_doctype_decl_2(?STRING_UNBOUND_REST("IGNORE", Rest), State) -> + case State#xmerl_sax_parser_state.file_type of + normal -> + ?fatal_error(State, "Conditional sections may only appear in the external DTD subset."); + _ -> + parse_ignore_sect(Rest, State) + end; +parse_doctype_decl_2(?STRING_UNBOUND_REST(C, _) = Bytes, State) when ?is_whitespace(C) -> + {_WS, Rest, State1} = whitespace(Bytes, State, []), + parse_doctype_decl_2(Rest, State1); +parse_doctype_decl_2(?STRING_UNBOUND_REST("%", Rest), State) -> + {Ref, Rest1, State1} = parse_pe_reference(Rest, State), + case Ref of + {internal_parameter, _, RefValue} -> + IValue = ?TO_INPUT_FORMAT(RefValue), + parse_doctype_decl_2(?APPEND_STRING(IValue, Rest1), State1); + {external_parameter, _, {PubId, SysId}} -> + {_, State2} = parse_external_entity(State1#xmerl_sax_parser_state{file_type = dtd}, PubId, SysId, []), + parse_doctype_decl_2(Rest1, State2); + {not_found, Name} -> + case State#xmerl_sax_parser_state.skip_external_dtd of + false -> + ?fatal_error(State1, "Entity not declared: " ++ Name); %%WFC: Entity Declared + true -> + parse_doctype_decl_2(Rest1, State1) + end + end; + +parse_doctype_decl_2(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_doctype_decl_2/2], + "expecting INCLUDE or IGNORE"). + %%---------------------------------------------------------------------- %% Function : parse_element_decl(Rest, State) -> Result @@ -2269,15 +2616,21 @@ parse_element_decl(Bytes, State) -> unicode_incomplete_check([Bytes, State, fun parse_element_decl/2], "whitespace expected"). +parse_element_decl_1(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_element_decl_1/2); parse_element_decl_1(?STRING_UNBOUND_REST(C, Rest), State) -> case is_name_start(C) of - true -> - {Name, Rest1, State1} = parse_name(Rest, State, [C]), - {Model, Rest2, State2} = parse_element_content(Rest1, State1), - State3 = event_callback({elementDecl, Name, Model}, State2), - {Rest2, State3}; - false -> - ?fatal_error(State, "name expected") + true -> + {Name, Rest1, State1} = parse_name(Rest, State, [C]), + case parse_element_content(Rest1, State1) of + {[],_,_} -> + ?fatal_error(State, "Content spec missing"); + {Model, Rest2, State2} -> + State3 = event_callback({elementDecl, Name, Model}, State2), + {Rest2, State3} + end; + false -> + ?fatal_error(State, "name expected") end; parse_element_decl_1(Bytes, State) -> unicode_incomplete_check([Bytes, State, fun parse_element_decl_1/2], @@ -2312,15 +2665,262 @@ parse_element_content(Bytes, State) -> %% Description: Parse contents of an element declaration. %%---------------------------------------------------------------------- parse_element_content_1(?STRING_EMPTY, State, Acc) -> - cf(?STRING_EMPTY, State, Acc, fun parse_element_content_1/3); + cf(?STRING_EMPTY, State, Acc, fun parse_element_content_1/3); +parse_element_content_1(?STRING("A") = Bytes, State, Acc) -> + cf(Bytes, State, Acc, fun parse_element_content_1/3); +parse_element_content_1(?STRING("AN") = Bytes, State, Acc) -> + cf(Bytes, State, Acc, fun parse_element_content_1/3); +parse_element_content_1(?STRING_REST("ANY", Rest), State, Acc) -> + parse_element_content_1(Rest, State, "YNA" ++ Acc); +parse_element_content_1(?STRING("E") = Bytes, State, Acc) -> + cf(Bytes, State, Acc, fun parse_element_content_1/3); +parse_element_content_1(?STRING("EM") = Bytes, State, Acc) -> + cf(Bytes, State, Acc, fun parse_element_content_1/3); +parse_element_content_1(?STRING("EMP") = Bytes, State, Acc) -> + cf(Bytes, State, Acc, fun parse_element_content_1/3); +parse_element_content_1(?STRING("EMPT") = Bytes, State, Acc) -> + cf(Bytes, State, Acc, fun parse_element_content_1/3); +parse_element_content_1(?STRING_REST("EMPTY", Rest), State, Acc) -> + parse_element_content_1(Rest, State, "YTPME" ++ Acc); parse_element_content_1(?STRING_REST(">", Rest), State, Acc) -> {lists:reverse(delete_leading_whitespace(Acc)), Rest, State}; -parse_element_content_1(?STRING_UNBOUND_REST(C, Rest), State, Acc) -> - parse_element_content_1(Rest, State, [C|Acc]); +parse_element_content_1(?STRING_UNBOUND_REST("(", Rest), State, []) -> + parse_element_content_2(Rest, State, [$(], {1, [none]}); +parse_element_content_1(?STRING_UNBOUND_REST("(", _), State, _) -> + ?fatal_error(State, "> expected"); +parse_element_content_1(?STRING_UNBOUND_REST("%", Rest), State, Acc) -> + {Ref, Rest1, State1} = parse_pe_reference(Rest, State), + case Ref of + {internal_parameter, _, RefValue} -> + IValue = ?TO_INPUT_FORMAT(RefValue), + parse_element_content_1(?APPEND_STRING(IValue, Rest1), State1, Acc); + {external_parameter, _, {PubId, SysId}} -> + {Acc1, State2} = parse_external_entity(State1#xmerl_sax_parser_state{file_type = dtd}, PubId, SysId, Acc), + parse_element_content_1(Rest1, State2, Acc1); + {not_found, Name} -> + case State#xmerl_sax_parser_state.skip_external_dtd of + false -> + ?fatal_error(State1, "Entity not declared: " ++ Name); %%WFC: Entity Declared + true -> + parse_element_content_1(Rest1, State1, Acc) + end + end; +parse_element_content_1(?STRING_UNBOUND_REST(C, _) = Rest, State, Acc) when ?is_whitespace(C) -> + {WS, Rest1, State1} = whitespace(Rest, State, []), + parse_element_content_1(Rest1, State1, WS ++ Acc); +parse_element_content_1(?STRING_UNBOUND_REST(C, _), State, _Acc) -> + ?fatal_error(State, "'(' expected got " ++ [C]); parse_element_content_1(Bytes, State, Acc) -> unicode_incomplete_check([Bytes, State, Acc, fun parse_element_content_1/3], undefined). +%%---------------------------------------------------------------------- +%% Function : parse_element_content_2(Rest, State, Acc, Depth) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Acc = string() +%% Result : {Content, Rest, State} +%% Content = string() +%% Description: Parse element declaration Mixed | children. +%%---------------------------------------------------------------------- +parse_element_content_2(?STRING_EMPTY, State, Acc, Depth) -> + cf(?STRING_EMPTY, State, Acc, Depth, fun parse_element_content_2/4); +parse_element_content_2(?STRING("#") = Bytes, State, Acc, Depth) -> + cf(Bytes, State, Acc, Depth, fun parse_element_content_2/4); +parse_element_content_2(?STRING("#P") = Bytes, State, Acc, Depth) -> + cf(Bytes, State, Acc, Depth, fun parse_element_content_2/4); +parse_element_content_2(?STRING("#PC") = Bytes, State, Acc, Depth) -> + cf(Bytes, State, Acc, Depth, fun parse_element_content_2/4); +parse_element_content_2(?STRING("#PCD") = Bytes, State, Acc, Depth) -> + cf(Bytes, State, Acc, Depth, fun parse_element_content_2/4); +parse_element_content_2(?STRING("#PCDA") = Bytes, State, Acc, Depth) -> + cf(Bytes, State, Acc, Depth, fun parse_element_content_2/4); +parse_element_content_2(?STRING("#PCDAT") = Bytes, State, Acc, Depth) -> + cf(Bytes, State, Acc, Depth, fun parse_element_content_2/4); +parse_element_content_2(?STRING_REST("#PCDATA", _), State, _, {_, ['|'|_]}) -> + ?fatal_error(State, "#PCDATA can only come first in element content."); +parse_element_content_2(?STRING_REST("#PCDATA", Rest), State, Acc, {1, Sep}) -> + parse_element_content_4(Rest, State, "ATADCP#" ++ Acc, {1, [any|Sep]}); +parse_element_content_2(?STRING_UNBOUND_REST("%", Rest), State, Acc, Depth) -> + case State#xmerl_sax_parser_state.file_type of + normal -> + % not allowed locally + ?fatal_error(State, "PE not allowed in declaration."); %%WFC: Entity Declared + _ -> + {Ref, Rest1, State1} = parse_pe_reference(Rest, State), + case Ref of + {internal_parameter, _, RefValue} -> + IValue = ?TO_INPUT_FORMAT(RefValue), + parse_element_content_2(?APPEND_STRING(IValue, Rest1), State1, Acc, Depth); + {external_parameter, _, {PubId, SysId}} -> + {Acc1, State2} = parse_external_entity(State1#xmerl_sax_parser_state{file_type = entity}, PubId, SysId, Acc), + parse_element_content_2(Rest1, State2, Acc1, Depth); + {not_found, Name} -> + case State#xmerl_sax_parser_state.skip_external_dtd of + false -> + ?fatal_error(State1, "Entity not declared: " ++ Name); %%WFC: Entity Declared + true -> + parse_element_content_2(Rest1, State1, Acc, Depth) + end + end + end; +parse_element_content_2(?STRING_UNBOUND_REST(")", Rest), State, Acc, {1, _}) -> + case lists:all(fun(C) when ?is_whitespace(C) -> true; + ($()-> true; + (_) -> false + end, Acc) of + true -> + ?fatal_error(State, "Element content missing."); + false when Acc == "(" -> + ?fatal_error(State, "Element content missing."); + false -> + case Acc of + [$,|_] -> + ?fatal_error(State, "expecting value"); + [$||_] -> + ?fatal_error(State, "expecting value"); + _ -> + {Acc1, Rest1, State1} = parse_element_content_3(Rest, State, [$)|Acc]), + parse_element_content_1(Rest1, State1, Acc1) + end + end; +parse_element_content_2(?STRING_UNBOUND_REST("(", Rest), State, Acc, {Depth, [H|Sep]}) -> + H1 = if H == none -> any; + H == any -> ?fatal_error(State, "expecting separator"); + true -> + check_separator(Acc, H, State) + end, + parse_element_content_2(Rest, State, [$(|Acc], {Depth + 1, [none,H1|Sep]}); +parse_element_content_2(?STRING_UNBOUND_REST(")", Rest), State, Acc, {Depth, [_|Sep]}) -> + case Acc of + [$,|_] -> + ?fatal_error(State, "expecting value"); + [$||_] -> + ?fatal_error(State, "expecting value"); + _ -> + {Acc1, Rest1, State1} = parse_element_content_3(Rest, State, [$)|Acc]), + parse_element_content_2(Rest1, State1, Acc1, {Depth - 1, Sep}) + end; +parse_element_content_2(?STRING_UNBOUND_REST(C, _) = Rest, State, Acc, Depth) when ?is_whitespace(C) -> + {WS, Rest1, State1} = whitespace(Rest, State, []), + parse_element_content_2(Rest1, State1, WS ++ Acc, Depth); +parse_element_content_2(?STRING_UNBOUND_REST("|", Rest), State, Acc, {Depth, [any|T]}) -> + parse_element_content_2(Rest, State, [$||Acc], {Depth, ['|'|T]}); +parse_element_content_2(?STRING_UNBOUND_REST("|", Rest), State, Acc, {_, ['|'|_]} = Sep) -> + case Acc of + [$||_] -> + ?fatal_error(State, "expecting value"); + _ -> + parse_element_content_2(Rest, State, [$||Acc], Sep) + end; +parse_element_content_2(?STRING_UNBOUND_REST(",", Rest), State, Acc, {Depth, [any|T]}) -> + parse_element_content_2(Rest, State, [$,|Acc], {Depth, [','|T]}); +parse_element_content_2(?STRING_UNBOUND_REST(",", Rest), State, Acc, {_, [','|_]} = Sep) -> + case Acc of + [$,|_] -> + ?fatal_error(State, "expecting value"); + _ -> + parse_element_content_2(Rest, State, [$,|Acc], Sep) + end; +parse_element_content_2(?STRING_UNBOUND_REST("|", _), State, _Acc, {_, [H|_]}) -> + ?fatal_error(State, "Expected: " ++ atom_to_list(H)); +parse_element_content_2(?STRING_UNBOUND_REST(",", _), State, _Acc, {_, [H|_]}) -> + ?fatal_error(State, "Expected: " ++ atom_to_list(H)); +parse_element_content_2(?STRING_UNBOUND_REST(C, Rest), State, Acc, {Depth, [H|T]}) -> + case is_name_start(C) of + true -> + H1 = if H == none -> any; + H == any -> ?fatal_error(State, "expecting separator"); + true -> + check_separator(Acc, H, State) + end, + {Name, Rest1, State1} = parse_name(Rest, State, [C]), + {Acc1, Rest2, State2} = parse_element_content_3(Rest1, State1, lists:reverse(Name) ++ Acc), + parse_element_content_2(Rest2, State2, Acc1, {Depth, [H1|T]}); + false -> + ?fatal_error(State, "name expected: " ++ [C]) + end; +parse_element_content_2(Bytes, State, Acc, _Depth) -> + parse_element_content_1(Bytes, State, Acc). + +% maybe parse the cardinality +parse_element_content_3(?STRING_EMPTY, State, Acc) -> + cf(?STRING_EMPTY, State, Acc, fun parse_element_content_3/3); +parse_element_content_3(?STRING_UNBOUND_REST("?", Rest), State, Acc) -> + {[$?|Acc], Rest, State}; +parse_element_content_3(?STRING_UNBOUND_REST("+", Rest), State, Acc) -> + {[$+|Acc], Rest, State}; +parse_element_content_3(?STRING_UNBOUND_REST("*", Rest), State, Acc) -> + {[$*|Acc], Rest, State}; +parse_element_content_3(Rest, State, Acc) -> + {Acc, Rest, State}. + +% Mixed Content [51] +parse_element_content_4(?STRING_EMPTY, State, Acc, Depth) -> + cf(?STRING_EMPTY, State, Acc, Depth, fun parse_element_content_4/4); +parse_element_content_4(?STRING(")") = Bytes, State, Acc, Depth) -> + cf(Bytes, State, Acc, Depth, fun parse_element_content_4/4); +parse_element_content_4(?STRING_UNBOUND_REST("|", Rest), State, Acc, {Depth, [any|T]}) -> + parse_element_content_4(Rest, State, [$||Acc], {Depth, ['|'|T]}); +parse_element_content_4(?STRING_UNBOUND_REST("|", Rest), State, Acc, {_, ['|'|_]} = Sep) -> + case Acc of + [$||_] -> + ?fatal_error(State, "expecting value"); + _ -> + parse_element_content_4(Rest, State, [$||Acc], Sep) + end; +parse_element_content_4(?STRING_UNBOUND_REST("|", Rest), State, Acc, Depth) -> + parse_element_content_4(Rest, State, [$||Acc], Depth); +parse_element_content_4(?STRING_UNBOUND_REST(C, Rest), State, Acc, Depth) when ?is_whitespace(C) -> + parse_element_content_4(Rest, State, [C|Acc], Depth); + +parse_element_content_4(?STRING_UNBOUND_REST(")*", Rest), State, Acc, {1, _}) -> + parse_element_content_1(Rest, State, [$*,$)|Acc]); +parse_element_content_4(?STRING_UNBOUND_REST(")", _), State, _, {1, [','|_]}) -> + ?fatal_error(State, ")* expected after mixed content"); +parse_element_content_4(?STRING_UNBOUND_REST(")", _), State, _, {1, ['|'|_]}) -> + ?fatal_error(State, ")* expected after mixed content"); +parse_element_content_4(?STRING_UNBOUND_REST(")", Rest), State, Acc, {1, _}) -> + parse_element_content_1(Rest, State, [$)|Acc]); + +parse_element_content_4(?STRING_UNBOUND_REST(")*", Rest), State, Acc, {Depth, [_|T]}) -> + parse_element_content_2(Rest, State, [$*,$)|Acc], {Depth - 1, T}); +parse_element_content_4(?STRING_UNBOUND_REST(")", Rest), State, Acc, {Depth, [_|T]}) -> + parse_element_content_2(Rest, State, [$)|Acc], {Depth - 1, T}); +parse_element_content_4(?STRING_UNBOUND_REST("%", Rest), State, Acc, Depth) -> + {Ref, Rest1, State1} = parse_pe_reference(Rest, State), + case Ref of + {internal_parameter, _, RefValue} -> + IValue = ?TO_INPUT_FORMAT(" " ++ RefValue ++ " "), + parse_element_content_4(?APPEND_STRING(IValue, Rest1), State1, Acc, Depth); + {external_parameter, _, {_PubId, _SysId}} -> + ?fatal_error(State1, "External parameter name"); + {not_found, _Name} -> + ?fatal_error(State1, "Unknown reference parameter name") + end; +parse_element_content_4(?STRING_UNBOUND_REST(C, Rest), State, Acc, {Depth, [H|T]}) -> + case is_name_start(C) of + true -> + H1 = if H == none -> any; + H == any -> ?fatal_error(State, "expecting separator"); + true -> + check_separator(Acc, H, State) + end, + {Name, Rest1, State1} = parse_name(Rest, State, [C]), + parse_element_content_4(Rest1, State1, lists:reverse(Name) ++ Acc, {Depth, [H1|T]}); + false -> + ?fatal_error(State, "name expected: " ++ [C]) + end; +parse_element_content_4(Rest1, State, Acc, Depth) -> + parse_element_content_2(Rest1, State, Acc, Depth). + +check_separator([W|Acc], S, State) when ?is_whitespace(W) -> + check_separator(Acc, S, State); +check_separator([$,|_], ',', _) -> ','; +check_separator([$||_], '|', _) -> '|'; +check_separator(_, _, State) -> + ?fatal_error(State, "Expected serarator"). + delete_leading_whitespace([C |Acc]) when ?is_whitespace(C)-> delete_leading_whitespace(Acc); delete_leading_whitespace(Acc) -> @@ -2339,15 +2939,55 @@ parse_att_list_decl(?STRING_EMPTY, State) -> parse_att_list_decl(?STRING_UNBOUND_REST(C, _) = Bytes, State) when ?is_whitespace(C) -> {_WS, Rest, State1} = whitespace(Bytes, State, []), parse_att_list_decl_1(Rest, State1); +parse_att_list_decl(?STRING_UNBOUND_REST("%", Rest), State) -> + {Ref, Rest1, State1} = parse_pe_reference(Rest, State), + case Ref of + {internal_parameter, _, RefValue} -> + IValue = ?TO_INPUT_FORMAT(" " ++ RefValue ++ " "), + parse_att_list_decl(?APPEND_STRING(IValue, Rest1), State1); + {external_parameter, _, {PubId, SysId}} -> + {_, State2} = parse_external_entity(State1#xmerl_sax_parser_state{file_type = dtd}, PubId, SysId, []), + parse_att_list_decl(Rest1, State2); + {not_found, Name} -> + case State#xmerl_sax_parser_state.skip_external_dtd of + false -> + ?fatal_error(State1, "Entity not declared: " ++ Name); %%WFC: Entity Declared + true -> + parse_att_list_decl(Rest1, State1) + end + end; parse_att_list_decl(Bytes, State) -> unicode_incomplete_check([Bytes, State, fun parse_att_list_decl/2], "whitespace expected"). - +parse_att_list_decl_1(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_att_list_decl_1/2); +parse_att_list_decl_1(?STRING_UNBOUND_REST("%", Rest), State) -> + case State#xmerl_sax_parser_state.file_type of + normal -> + ?fatal_error(State, "Parsed entities not allowed in Internal subset"); %%WFC: PEs in Internal Subset + _ -> + {Ref, Rest1, State1} = parse_pe_reference(Rest, State), + case Ref of + {internal_parameter, _, RefValue} -> + IValue = ?TO_INPUT_FORMAT(RefValue), + parse_att_list_decl_1(?APPEND_STRING(IValue, Rest1), State1); + {external_parameter, _, {PubId, SysId}} -> + {_, State2} = parse_external_entity(State1#xmerl_sax_parser_state{file_type = entity}, PubId, SysId, []), + parse_att_list_decl(Rest1, State2); + {not_found, Name} -> + case State#xmerl_sax_parser_state.skip_external_dtd of + false -> + ?fatal_error(State1, "Entity not declared: " ++ Name); %%WFC: Entity Declared + true -> + parse_att_list_decl(Rest1, State1) + end + end + end; parse_att_list_decl_1(?STRING_UNBOUND_REST(C, Rest), State) -> case is_name_start(C) of true -> - {ElementName, Rest1, State1} = parse_name(Rest, State, [C]), + {ElementName, Rest1, State1} = parse_ns_name(Rest, State, [], [C]), parse_att_defs(Rest1, State1, ElementName); false -> ?fatal_error(State, "name expected") @@ -2373,21 +3013,69 @@ parse_att_defs(?STRING_REST(">", Rest), State, _ElementName) -> parse_att_defs(?STRING_UNBOUND_REST(C, _) = Rest, State, ElementName) when ?is_whitespace(C) -> {_WS, Rest1, State1} = whitespace(Rest, State, []), parse_att_defs(Rest1, State1, ElementName); +parse_att_defs(?STRING_UNBOUND_REST("%", Rest), #xmerl_sax_parser_state{file_type = Type} = State, ElementName) -> + case Type of + normal -> + ?fatal_error(State, "Parsed entities not allowed in Internal subset"); %%WFC: PEs in Internal Subset + _ -> + {Ref, Rest1, State1} = parse_pe_reference(Rest, State), + case Ref of + {internal_parameter, _, RefValue} -> + IValue = ?TO_INPUT_FORMAT(" " ++ RefValue ++ " "), + parse_att_defs(?APPEND_STRING(IValue, Rest1), State1, ElementName); + {external_parameter, _, {PubId, SysId}} -> + {_, State2} = parse_external_entity(State1#xmerl_sax_parser_state{file_type = entity}, PubId, SysId, []), + parse_att_defs(Rest1, State2#xmerl_sax_parser_state{file_type = Type}, ElementName); + {not_found, Name} -> + case State#xmerl_sax_parser_state.skip_external_dtd of + false -> + ?fatal_error(State1, "Entity not declared: " ++ Name); %%WFC: Entity Declared + true -> + parse_att_defs(Rest1, State1, ElementName) + end + end + end; parse_att_defs(?STRING_UNBOUND_REST(C, Rest), State, ElementName) -> case is_name_start(C) of - true -> - {AttrName, Rest1, State1} = parse_name(Rest, State, [C]), - {Type, Rest2, State2} = parse_att_type(Rest1, State1), - {Mode, Value, Rest3, State3} = parse_default_decl(Rest2, State2), - State4 = event_callback({attributeDecl, ElementName, AttrName, Type, Mode, Value}, State3), - parse_att_defs(Rest3, State4, ElementName); - false -> - ?fatal_error(State, "whitespace or name expected") + true -> + {AttrName, Rest1, State1} = parse_ns_name(Rest, State, [], [C]), + {Type, Rest2, State2} = parse_att_type(Rest1, State1), + {Mode, Value, Rest3, State3} = parse_default_decl(Rest2, State2), + State4 = event_callback({attributeDecl, ElementName, AttrName, Type, Mode, Value}, State3), + State5 = + if + Type == "CDATA" andalso Mode == "#FIXED"; + Type == "CDATA" andalso Mode == ""; + Type == "" andalso Mode == "#FIXED"; + Type == "" andalso Mode == "" -> + % non-normalized default + add_default_attribute({ElementName, AttrName, Value}, State4); + Mode == "#FIXED"; + Mode == "" -> + % default and normalized + add_default_attribute({ElementName, AttrName, {Value, normalize}}, State4); + Type == "CDATA"; + Type == "" -> + % as-is + add_default_attribute({ElementName, AttrName, ignore}, State4); + true -> + % just normalize + add_default_attribute({ElementName, AttrName, normalize}, State4) + end, + parse_att_defs(Rest3, State5, ElementName); + false -> + ?fatal_error(State, "whitespace or name expected") end; parse_att_defs(Bytes, State, ElementName) -> unicode_incomplete_check([Bytes, State, ElementName, fun parse_att_defs/3], undefined). +add_default_attribute({ElementName, AttrName, Value}, + #xmerl_sax_parser_state{attribute_values = Atts} = State) -> + % first value wins when there are duplicates + Key = {ElementName, AttrName}, + Atts1 = merge_on_key({Key, Value}, Atts), + State#xmerl_sax_parser_state{attribute_values = Atts1}. %%---------------------------------------------------------------------- %% Function : parse_att_type(Rest, State) -> Result @@ -2409,16 +3097,35 @@ parse_att_type(?STRING_EMPTY, State) -> parse_att_type(?STRING_UNBOUND_REST(C, _) = Bytes, State) when ?is_whitespace(C) -> {_WS, Rest, State1} = whitespace(Bytes, State, []), case parse_att_type_1(Rest, State1, []) of - {Type, Rest1, State2} when Type == "("; Type == "NOTATION" -> - {T, Rest2, State3} = parse_until_right_paren(Rest1, State2, []), - {Type ++ T, Rest2, State3}; - {Type, Rest1, State2} -> - case check_att_type(Type) of - true -> - {Type, Rest1, State2}; - false -> - ?fatal_error(State2, "wrong attribute type") - end + {"(", Rest1, State2} -> + {T, Rest2, State3} = parse_until_right_paren(Rest1, State2, []), + case T of + ")" -> + ?fatal_error(State3, "Empty attribute enumerated type."); + _ -> + {"(" ++ T, Rest2, State3} + end; + {"NOTATION", Rest1, State2} -> + {_WS, Rest2, State3} = whitespace(Rest1, State2, []), + case parse_att_type_1(Rest2, State3, []) of + {"(", Rest3, State4} -> + {T, Rest4, State5} = parse_until_right_paren(Rest3, State4, []), + case T of + ")" -> + ?fatal_error(State5, "Empty attribute notation type."); + _ -> + {"(" ++ T, Rest4, State5} + end; + {Type, _, _} -> + ?fatal_error(State2, "wrong attribute type: " ++ Type) + end; + {Type, Rest1, State2} -> + case check_att_type(Type) of + true -> + {Type, Rest1, State2}; + false -> + ?fatal_error(State2, "wrong attribute type: " ++ Type) + end end; parse_att_type(Bytes, State) -> unicode_incomplete_check([Bytes, State, fun parse_att_type/2], @@ -2438,6 +3145,23 @@ parse_att_type_1(?STRING_EMPTY, State, Acc) -> cf(?STRING_EMPTY, State, Acc, fun parse_att_type_1/3); parse_att_type_1(?STRING_UNBOUND_REST(C, _) = Bytes, State, Acc) when ?is_whitespace(C) -> {lists:reverse(Acc), Bytes, State}; +parse_att_type_1(?STRING_UNBOUND_REST("%", Rest), State, Acc) -> + {Ref, Rest1, State1} = parse_pe_reference(Rest, State), + case Ref of + {internal_parameter, _, RefValue} -> + IValue = ?TO_INPUT_FORMAT(" " ++ RefValue ++ " "), + parse_att_type_1(?APPEND_STRING(IValue, Rest1), State1, Acc); + {external_parameter, _, {PubId, SysId}} -> + {Acc1, State2} = parse_external_entity(State1#xmerl_sax_parser_state{file_type = entity}, PubId, SysId, Acc), + parse_att_type_1(Rest1, State2, Acc1); + {not_found, Name} -> + case State#xmerl_sax_parser_state.skip_external_dtd of + false -> + ?fatal_error(State1, "Entity not declared: " ++ Name); %%WFC: Entity Declared + true -> + parse_att_type_1(Rest1, State1, Acc) + end + end; parse_att_type_1(?STRING_REST("(", Rest), State, []) -> {"(", Rest, State}; parse_att_type_1(?STRING_UNBOUND_REST(C, Rest), State, Acc) -> @@ -2485,8 +3209,16 @@ parse_until_right_paren(?STRING_EMPTY, State, Acc) -> cf(?STRING_EMPTY, State, Acc, fun parse_until_right_paren/3); parse_until_right_paren(?STRING_REST(")", Rest), State, Acc) -> {lists:reverse(")" ++ Acc), Rest, State}; -parse_until_right_paren(?STRING_UNBOUND_REST(C, Rest), State, Acc) -> +parse_until_right_paren(?STRING_UNBOUND_REST(C, Rest), State, Acc) when ?is_whitespace(C) -> parse_until_right_paren(Rest, State, [C|Acc]); +parse_until_right_paren(?STRING_UNBOUND_REST(C, Rest), State, Acc) -> + TokenChar = C == $| orelse is_name_char(C), + case TokenChar of + true -> + parse_until_right_paren(Rest, State, [C|Acc]); + false -> + ?fatal_error(State, lists:flatten(io_lib:format("Bad character in enumeration: ~p", [[C]]))) + end; parse_until_right_paren(Bytes, State, Acc) -> unicode_incomplete_check([Bytes, State, Acc, fun parse_until_right_paren/3], undefined). @@ -2505,7 +3237,7 @@ parse_default_decl(?STRING_EMPTY, State) -> cf(?STRING_EMPTY, State, fun parse_default_decl/2); parse_default_decl(?STRING_UNBOUND_REST(C, _) = Bytes, State) when ?is_whitespace(C) -> {_WS, Rest, State1} = whitespace(Bytes, State, []), - parse_default_decl_1(Rest, State1); + parse_default_decl_2(Rest, State1); parse_default_decl(Bytes, State) -> unicode_incomplete_check([Bytes, State, fun parse_default_decl/2], "whitespace expected"). @@ -2523,6 +3255,8 @@ parse_default_decl_1(?STRING_EMPTY, State) -> cf(?STRING_EMPTY, State, fun parse_default_decl_1/2); parse_default_decl_1(?STRING_REST("#", _Rest) = Bytes, State) -> case Bytes of + ?STRING("#") -> + cf(Bytes, State, fun parse_default_decl_1/2); ?STRING("#R") -> cf(Bytes, State, fun parse_default_decl_1/2); ?STRING("#RE") -> @@ -2575,7 +3309,26 @@ parse_default_decl_1(Bytes, State) -> unicode_incomplete_check([Bytes, State, fun parse_default_decl_1/2], "bad default declaration"). +parse_default_decl_2(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_default_decl_2/2); +parse_default_decl_2(?STRING_REST("%", Rest), State) -> + {Ref, Rest1, State1} = parse_pe_reference(Rest, State), + case Ref of + {internal_parameter, _, RefValue} -> + IValue = ?TO_INPUT_FORMAT(" " ++ RefValue ++ " "), + parse_default_decl(?APPEND_STRING(IValue, Rest1), State1); + {external_parameter, _, {PubId, SysId}} -> + {Acc, State2} = parse_external_entity(State1#xmerl_sax_parser_state{file_type = entity}, PubId, SysId, []), + IValue = ?TO_INPUT_FORMAT(" " ++ lists:reverse(Acc) ++ " "), + parse_default_decl(?APPEND_STRING(IValue, Rest1), State2); + {not_found, _Name} -> + ?fatal_error(State, "REQUIRED, IMPLIED or FIXED expected") + end; +parse_default_decl_2(Bytes, State) -> + parse_default_decl_1(Bytes, State). +parse_fixed(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_fixed/2); parse_fixed(?STRING_UNBOUND_REST(C, _) = Bytes, State) when ?is_whitespace(C) -> {DefaultValue, Rest, State1} = parse_att_value(Bytes, State), % parse_att_value removes leading WS {"#FIXED", DefaultValue, Rest, State1}; @@ -2612,6 +3365,8 @@ parse_entity_decl(Bytes, State) -> %%---------------------------------------------------------------------- parse_entity_decl_1(?STRING_EMPTY, State) -> cf(?STRING_EMPTY, State, fun parse_entity_decl_1/2); +parse_entity_decl_1(?STRING("%") = Bytes, State) -> + cf(Bytes, State, fun parse_entity_decl_1/2); parse_entity_decl_1(?STRING_REST("%", Rest), State) -> case is_next_char_whitespace(Rest, State) of true -> @@ -2638,9 +3393,8 @@ parse_entity_decl_1(Bytes, State) -> unicode_incomplete_check([Bytes, State, fun parse_entity_decl_1/2], undefined). - - - +parse_pe_name(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_pe_name/2); parse_pe_name(?STRING_UNBOUND_REST(C, Rest), State) -> case is_name_start(C) of true -> @@ -2698,6 +3452,8 @@ parse_entity_def(Bytes, State, Name) -> "\", \', SYSTEM or PUBLIC expected"). +parse_def_end(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_def_end/2); parse_def_end(?STRING_REST(">", Rest), State) -> {Rest, State}; parse_def_end(Bytes, State) -> @@ -2743,16 +3499,16 @@ parse_entity_value(?STRING("\r"), State, Stop, Acc) -> cf(?STRING("\r"), State, Stop, Acc, fun parse_entity_value/4); parse_entity_value(?STRING_REST("\n", Rest), #xmerl_sax_parser_state{line_no=N} = State, Stop, Acc) -> parse_entity_value(Rest, - State#xmerl_sax_parser_state{line_no=N+1}, Stop, [?space |Acc]); + State#xmerl_sax_parser_state{line_no=N+1}, Stop, [?lf |Acc]); parse_entity_value(?STRING_REST("\r\n", Rest), #xmerl_sax_parser_state{line_no=N} = State, Stop, Acc) -> parse_entity_value(Rest, - State#xmerl_sax_parser_state{line_no=N+1}, Stop, [?space |Acc]); + State#xmerl_sax_parser_state{line_no=N+1}, Stop, [?lf |Acc]); parse_entity_value(?STRING_REST("\r", Rest), #xmerl_sax_parser_state{line_no=N} = State, Stop, Acc) -> parse_entity_value(Rest, - State#xmerl_sax_parser_state{line_no=N+1}, Stop, [?space |Acc]); + State#xmerl_sax_parser_state{line_no=N+1}, Stop, [?lf |Acc]); parse_entity_value(?STRING_REST("\t", Rest), #xmerl_sax_parser_state{line_no=N} = State, Stop, Acc) -> parse_entity_value(Rest, - State#xmerl_sax_parser_state{line_no=N+1}, Stop, [?space |Acc]); + State#xmerl_sax_parser_state{line_no=N+1}, Stop, [?tab |Acc]); parse_entity_value(?STRING_REST("&", Rest), State, Stop, Acc) -> {Ref, Rest1, State1} = parse_reference(Rest, State, false), case Ref of @@ -2776,11 +3532,16 @@ parse_entity_value(?STRING_REST("%", Rest), #xmerl_sax_parser_state{file_type=Ty "markup declarations in the internal DTD subset: " ++ Name); _ -> case Ref of + {internal_parameter, _, []} -> + parse_entity_value(Rest1, State1, Stop, Acc); {internal_parameter, _, RefValue} -> - IValue = ?TO_INPUT_FORMAT(" " ++ RefValue ++ " "), - parse_entity_value(?APPEND_STRING(IValue, Rest1), State1, Stop, Acc); - {external_parameter, _, {_PubId, _SysId}} -> - ?fatal_error(State1, "Parameter references in entity value not supported yet."); + IValue = ?TO_INPUT_FORMAT(RefValue), + {Ctx, State2} = strip_context(State1), + {Acc1, ?STRING_EMPTY, State3} = parse_entity_content(IValue, State2, Acc, false), + parse_entity_value(Rest1, add_context_back(Ctx, State3), Stop, Acc1); + {external_parameter, _, {PubId, SysId}} -> + {Acc1, State2} = parse_external_entity(State1#xmerl_sax_parser_state{file_type = text}, PubId, SysId, Acc), + parse_entity_value(Rest1, State2#xmerl_sax_parser_state{file_type = Type}, Stop, Acc1); {not_found, Name} -> case State#xmerl_sax_parser_state.skip_external_dtd of false -> @@ -2832,6 +3593,8 @@ parse_ndata_decl(Bytes, State) -> "NDATA or > expected"). +parse_ndata_decl_1(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_ndata_decl_1/2); parse_ndata_decl_1(?STRING_UNBOUND_REST(C, _) = Bytes, State) when ?is_whitespace(C) -> {_WS, Rest, State1} = whitespace(Bytes, State, []), parse_ndecl_name(Rest, State1); @@ -2840,6 +3603,8 @@ parse_ndata_decl_1(Bytes, State) -> "whitespace expected"). +parse_ndecl_name(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_ndecl_name/2); parse_ndecl_name(?STRING_UNBOUND_REST(C, Rest), State) -> case is_name_start(C) of true -> @@ -2885,6 +3650,118 @@ parse_pe_def(Bytes, State, Name) -> unicode_incomplete_check([Bytes, State, Name, fun parse_pe_def/3], "\", \', SYSTEM or PUBLIC expected"). +%%---------------------------------------------------------------------- +%% Function : parse_include_sect(Rest, State) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Result : {Rest, State} +%% Description: Parse an INCLUDE section. +%% [62] includeSect ::= '<![' S? 'INCLUDE' S? '[' extSubsetDecl ']]>' +%%---------------------------------------------------------------------- +parse_include_sect(?STRING_EMPTY, State) -> + case catch cf(?STRING_EMPTY, State, fun parse_include_sect/2) of + {Rest, State1} when is_record(State1, xmerl_sax_parser_state) -> + {Rest, State1}; + {fatal_error, {State1, "No more bytes"}} -> + ?fatal_error(State1, "Unexpected EOF."); + {fatal_error, {State1, Mess}} -> + ?fatal_error(State1, Mess) + end; +parse_include_sect(?STRING("]") = Bytes, State) -> + cf(Bytes, State, fun parse_include_sect/2); +parse_include_sect(?STRING_REST("\n", Rest), #xmerl_sax_parser_state{line_no=N} = State) -> + parse_include_sect(Rest, State#xmerl_sax_parser_state{line_no=N+1}); +parse_include_sect(?STRING_REST("\r\n", Rest), #xmerl_sax_parser_state{line_no=N} = State) -> + parse_include_sect(Rest, State#xmerl_sax_parser_state{line_no=N+1}); +parse_include_sect(?STRING_REST("\r", Rest), #xmerl_sax_parser_state{line_no=N} = State) -> + parse_include_sect(Rest, State#xmerl_sax_parser_state{line_no=N+1}); +parse_include_sect(?STRING_UNBOUND_REST(C, Rest), State) when ?is_whitespace(C) -> + parse_include_sect(Rest, State); +parse_include_sect(?STRING_UNBOUND_REST("]>", Rest), State) -> + {Rest, State}; +parse_include_sect(?STRING_UNBOUND_REST("[", Rest), State) -> + parse_include_sect_1(Rest, State); +parse_include_sect(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_include_sect/2], + "subset declaration expected"). + +parse_include_sect_1(?STRING_EMPTY, State) -> + case catch cf(?STRING_EMPTY, State, fun parse_include_sect_1/2) of + {Rest, State1} when is_record(State1, xmerl_sax_parser_state) -> + {Rest, State1}; + {fatal_error, {State1, "No more bytes"}} -> + ?fatal_error(State1, "Unexpected EOF."); + {fatal_error, {State1, Mess}} -> + ?fatal_error(State1, Mess) + end; +parse_include_sect_1(?STRING("]") = Bytes, State) -> + cf(Bytes, State, fun parse_include_sect_1/2); +parse_include_sect_1(?STRING_UNBOUND_REST("]>", Rest), State) -> + {Rest, State}; +parse_include_sect_1(?STRING_UNBOUND_REST(_, _) = Bytes, State) -> + {Rest1, State1} = parse_text_decl(Bytes, State), + case catch parse_doctype_decl(Rest1, State1) of + {Rest2, State2} when is_record(State2, xmerl_sax_parser_state) -> + parse_include_sect_1(Rest2, State2); + {fatal_error, {State2, "No more bytes"}} -> + ?fatal_error(State2, "Unexpected EOF."); + {fatal_error, {State2, Message}} -> + ?fatal_error(State2, Message) + end; +parse_include_sect_1(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_include_sect_1/2], + "]> expected"). + +%%---------------------------------------------------------------------- +%% Function : parse_ignore_sect(Rest, State) -> Result +%% Parameters: Rest = string() | binary() +%% State = #xmerl_sax_parser_state{} +%% Result : {Rest, State} +%% Description: Parse an INCLUDE section. +%% [63] ignoreSect ::= '<![' S? 'IGNORE' S? '[' ignoreSectContents* ']]>' +%%---------------------------------------------------------------------- +parse_ignore_sect(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_ignore_sect/2); +parse_ignore_sect(?STRING("]") = Bytes, State) -> + cf(Bytes, State, fun parse_ignore_sect/2); +parse_ignore_sect(?STRING_UNBOUND_REST(C, _) = Bytes, State) when ?is_whitespace(C) -> + {_WS, Rest, State1} = whitespace(Bytes, State, []), + parse_ignore_sect(Rest, State1); +parse_ignore_sect(?STRING_UNBOUND_REST("[", Rest), State) -> + parse_ignore_sect_1(Rest, State, 1); +parse_ignore_sect(Bytes, State) -> + unicode_incomplete_check([Bytes, State, fun parse_ignore_sect/2], + "whitespace expected"). + +parse_ignore_sect_1(?STRING_EMPTY, State, Depth) -> + case catch cf(?STRING_EMPTY, State, Depth, fun parse_ignore_sect_1/3) of + {Rest, State1} when is_record(State1, xmerl_sax_parser_state) -> + {Rest, State1}; + {fatal_error, {State1, "No more bytes"}} -> + ?fatal_error(State1, "Unexpected EOF."); + {fatal_error, {State1, Message}} -> + ?fatal_error(State1, Message) + end; +parse_ignore_sect_1(?STRING("<") = Bytes, State, Depth) -> + cf(Bytes, State, Depth, fun parse_ignore_sect_1/3); +parse_ignore_sect_1(?STRING("<!") = Bytes, State, Depth) -> + cf(Bytes, State, Depth, fun parse_ignore_sect_1/3); +parse_ignore_sect_1(?STRING("]") = Bytes, State, Depth) -> + cf(Bytes, State, Depth, fun parse_ignore_sect_1/3); +parse_ignore_sect_1(?STRING("]]") = Bytes, State, Depth) -> + cf(Bytes, State, Depth, fun parse_ignore_sect_1/3); +parse_ignore_sect_1(?STRING_UNBOUND_REST("]]>", Rest), State, 1) -> + {Rest, State}; +parse_ignore_sect_1(?STRING_UNBOUND_REST("]]>", Rest), State, Depth) -> + parse_ignore_sect_1(Rest, State, Depth - 1); +parse_ignore_sect_1(?STRING_UNBOUND_REST("<![", Rest), State, Depth) -> + parse_ignore_sect_1(Rest, State, Depth + 1); +parse_ignore_sect_1(?STRING_UNBOUND_REST(_, Rest), State, Depth) -> + parse_ignore_sect_1(Rest, State, Depth); +parse_ignore_sect_1(Bytes, State, _) -> + unicode_incomplete_check([Bytes, State, fun parse_ignore_sect_1/3], + "Char expected"). + %%---------------------------------------------------------------------- %% Function : parse_notation_decl(Rest, State) -> Result @@ -2904,6 +3781,8 @@ parse_notation_decl(Bytes, State) -> "whitespace expected"). +parse_notation_decl_1(?STRING_EMPTY, State) -> + cf(?STRING_EMPTY, State, fun parse_notation_decl_1/2); parse_notation_decl_1(?STRING_UNBOUND_REST(C, Rest), State) -> case is_name_start(C) of true -> @@ -3052,6 +3931,8 @@ is_pubid_char($;) -> true; is_pubid_char($=) -> true; +is_pubid_char($?) -> + true; is_pubid_char($@) -> true; is_pubid_char($_) -> @@ -3872,3 +4753,176 @@ format_error(Tag, State, Reason) -> filter_endtag_stack(State#xmerl_sax_parser_state.end_tags), State#xmerl_sax_parser_state.event_state}. +external_continuation_cb({IoDevice, _}) -> + case file:read(IoDevice, 1024) of + eof -> + {<<>>, {IoDevice, <<>>}}; + {error, Err} -> + throw({error, Err}); + {ok, FileBin} -> + {FileBin, {IoDevice, <<>>}} + end. + +external_continuation_cb(FileEnc, FileEnc) -> + fun external_continuation_cb/1; +external_continuation_cb(FileEnc, BaseEnc) -> + fun({IoDevice, Rest}) -> + case file:read(IoDevice, 1024) of + eof when Rest == <<>> -> + {<<>>, {IoDevice, <<>>}}; + eof -> + {unicode:characters_to_binary(Rest, FileEnc, BaseEnc), {IoDevice, <<>>}}; + {error, Err} -> + throw({error, Err}); + {ok, FileBin} -> + Comp = <<Rest/binary, FileBin/binary>>, + case unicode:characters_to_binary(Comp, FileEnc, BaseEnc) of + {incomplete, Good, Bad} -> + {Good, {IoDevice, Bad}}; + {error, _, _} -> + throw({error, "bad data"}); + Good -> + {Good, {IoDevice, <<>>}} + end + end + end. + +encode_external_input(Head, FileEnc, BaseEnc, #xmerl_sax_parser_state{continuation_state = {FD, _}} = State) -> + {NewHead, NewCon} = + case unicode:characters_to_binary(Head, FileEnc, BaseEnc) of + {incomplete, Good, Bad} -> + {Good, {FD, Bad}}; + {error, _, _} -> + throw({error, "bad data"}); + Good -> + {Good, {FD, <<>>}} + end, + {NewHead, State#xmerl_sax_parser_state{continuation_state = NewCon}}. + +check_ref_cycle(#xmerl_sax_parser_state{ref_table = RefTable} = State) -> + List = maps:to_list(RefTable), + F = fun({K, {internal_general, R}}) -> + {K, get_ref_names(R)}; + ({K, _}) -> + {K, []} + end, + Mapped = lists:map(F, List), + IsCycle = lists:any(fun({K, V}) -> + check_ref_cycle(K, V, Mapped) + end, Mapped), + if + IsCycle -> + ?fatal_error(State, "Reference cycle"); + true -> + ok + end. + +check_ref_cycle(_, [], _) -> false; +check_ref_cycle(Key, Vals, List) -> + F = fun(V) -> + case lists:keyfind(V, 1, List) of + false -> + []; + {_, Vs} -> + Vs + end + end, + case lists:flatmap(F, Vals) of + [] -> + false; + Refs -> + case lists:member(Key, Refs) of + true -> + true; + false -> + check_ref_cycle(Key, Refs, List) + end + end. + +get_ref_names([$&|Rest]) -> + case get_ref_names_1(Rest, []) of + [] -> + []; + {Nm, Rest1} -> + [Nm|get_ref_names(Rest1)] + end; +get_ref_names([_|Rest]) -> + get_ref_names(Rest); +get_ref_names([]) -> []. + +get_ref_names_1([$;|Rest], Acc) -> + {lists:reverse(Acc), Rest}; +get_ref_names_1([C|Rest], Acc) -> + get_ref_names_1(Rest, [C|Acc]); +get_ref_names_1([], _) -> []. + +%%---------------------------------------------------------------------- +%% Function : strip_context(State) -> {Context, State} +%% Parameters: Tag = atom(), +%% State = xmerl_sax_parser_state() +%% Result : {Context, State} +%% Description: strips context from State before parsing entity +%%---------------------------------------------------------------------- +strip_context(#xmerl_sax_parser_state{end_tags = ET, + continuation_fun = CF} = State) -> + {{ET, CF}, State#xmerl_sax_parser_state{end_tags = [], + continuation_fun = undefined}}. +%%---------------------------------------------------------------------- +%% Function : add_context_back(Context, State) -> State +%% Parameters: Tag = atom(), +%% State = xmerl_sax_parser_state() +%% Result : State +%% Description: adds original context back to State after parsing entity +%%---------------------------------------------------------------------- +add_context_back({ET, CF}, State) -> + State#xmerl_sax_parser_state{end_tags = ET, + continuation_fun = CF}. + +%%---------------------------------------------------------------------- +%% Function: detect_charset(Xml, State) +%% Input: Xml = list() | binary() +%% State = #xmerl_sax_parser_state{} +%% Output: {utf8|utf16le|utf16be, Xml, State} +%% Description: Detects which character set is used in a binary stream. +%%---------------------------------------------------------------------- +detect_charset(State) -> + case catch cf(<<>>, State, fun detect_charset/2) of + {fatal_error, {State1, "No more bytes"}} -> + {<<>>, State1}; + Other -> + Other + end. + +detect_charset(<<>>, State) -> + {<<>>, State}; +detect_charset(<<16#00>> = Bytes, State) -> + cf(Bytes, State, fun detect_charset/2); +detect_charset(<<16#00, 16#00>> = Bytes, State) -> + cf(Bytes, State, fun detect_charset/2); +detect_charset(<<16#00, 16#00, 16#FE>> = Bytes, State) -> + cf(Bytes, State, fun detect_charset/2); +detect_charset(<<16#EF>> = Bytes, State) -> + cf(Bytes, State, fun detect_charset/2); +detect_charset(<<16#EF, 16#BB>> = Bytes, State) -> + cf(Bytes, State, fun detect_charset/2); +detect_charset(<<16#FE>> = Bytes, State) -> + cf(Bytes, State, fun detect_charset/2); +detect_charset(<<16#FF>> = Bytes, State) -> + cf(Bytes, State, fun detect_charset/2); +detect_charset(<<16#FF, 16#FE>> = Bytes, State) -> + cf(Bytes, State, fun detect_charset/2); +detect_charset(<<16#FF, 16#FE, 16#00>> = Bytes, State) -> + cf(Bytes, State, fun detect_charset/2); +detect_charset(<<16#00, 16#3C, 16#00, 16#3F, _/binary>> = Xml, State) -> + {Xml, State#xmerl_sax_parser_state{encoding={utf16, big}}}; +detect_charset(<<16#3C, 16#00, 16#3F, 16#00, _/binary>> = Xml, State) -> + {Xml, State#xmerl_sax_parser_state{encoding={utf16, little}}}; +detect_charset(Bytes, State) -> + case unicode:bom_to_encoding(Bytes) of + {latin1, 0} -> + {Bytes, State#xmerl_sax_parser_state{encoding=utf8}}; + {Enc, Length} -> + <<_:Length/binary, RealBytes/binary>> = Bytes, + {RealBytes, State#xmerl_sax_parser_state{encoding=Enc}} + end. + -- 2.16.4
Locations
Projects
Search
Status Monitor
Help
OpenBuildService.org
Documentation
API Documentation
Code of Conduct
Contact
Support
@OBShq
Terms
openSUSE Build Service is sponsored by
The Open Build Service is an
openSUSE project
.
Sign Up
Log In
Places
Places
All Projects
Status Monitor